ebofs/Ebofs.cc \
ebofs/Allocator.cc
libos_a_SOURCES = \
- osd/FileJournal.cc \
- osd/FakeStore.cc \
- osd/JournalingObjectStore.cc
+ os/FileJournal.cc \
+ os/FileStore.cc \
+ os/JournalingObjectStore.cc
libosd_a_SOURCES = \
osd/PG.cc \
ebofs/csum.h\
ebofs/BlockDevice.h\
ebofs/Ebofs.h\
- osd/FileJournal.h\
- osd/JournalingObjectStore.h\
+ os/FileJournal.h\
+ os/JournalingObjectStore.h\
ebofs/types.h\
ebofs/Allocator.h\
ebofs/BufferCache.h\
- osd/Journal.h\
+ os/Journal.h\
ebofs/nodes.h\
ebofs/Cnode.h\
ebofs/Onode.h\
msg/SimpleMessenger.h\
osbdb/OSBDB.h\
osd/Ager.h\
- osd/BDBMap.h\
- osd/Fake.h\
- osd/FakeStoreBDBCollections.h\
- osd/ObjectStore.h\
+ os/BDBMap.h\
+ os/Fake.h\
+ os/FakeStoreBDBCollections.h\
+ os/ObjectStore.h\
osd/ObjectVersioner.h\
osd/RAID4PG.h\
osd/ReplicatedPG.h\
osd/PG.h\
osd/OSDMap.h\
- osd/FakeStore.h\
+ os/FileStore.h\
osd/OSD.h\
osd/osd_types.h\
osdc/Blinker.h\
debug_client: 0,
debug_osd: 0,
debug_ebofs: 1,
- debug_fakestore: 1,
+ debug_filestore: 1,
debug_journal: 1,
debug_bdev: 1, // block device
debug_ns: 0,
osd_auto_weight: false,
- // --- fakestore ---
- fakestore: false,
- fakestore_sync_interval: .2, // seconds
- fakestore_fake_attrs: false,
- fakestore_fake_collections: false,
- fakestore_dev: 0,
+ // --- filestore ---
+ filestore: false,
+ filestore_sync_interval: .2, // seconds
+ filestore_fake_attrs: false,
+ filestore_fake_collections: false,
+ filestore_dev: 0,
// --- ebofs ---
ebofs: false,
g_conf.debug_ebofs = atoi(args[++i]);
else
g_debug_after_conf.debug_ebofs = atoi(args[++i]);
- else if (strcmp(args[i], "--debug_fakestore") == 0)
+ else if (strcmp(args[i], "--debug_filestore") == 0)
if (!g_conf.debug_after)
- g_conf.debug_fakestore = atoi(args[++i]);
+ g_conf.debug_filestore = atoi(args[++i]);
else
- g_debug_after_conf.debug_fakestore = atoi(args[++i]);
+ g_debug_after_conf.debug_filestore = atoi(args[++i]);
else if (strcmp(args[i], "--debug_journal") == 0)
if (!g_conf.debug_after)
g_conf.debug_journal = atoi(args[++i]);
else if (strcmp(args[i], "--journal_max_write_bytes") == 0)
g_conf.journal_max_write_bytes = atoi(args[++i]);
- else if (strcmp(args[i], "--fakestore") == 0)
- g_conf.fakestore = true;
- else if (strcmp(args[i], "--fakestore_sync_interval") == 0)
- g_conf.fakestore_sync_interval = atoi(args[++i]);
- else if (strcmp(args[i], "--fakestore_dev") == 0)
- g_conf.fakestore_dev = args[++i];
- else if (strcmp(args[i], "--fakestore_fake_attrs") == 0)
- g_conf.fakestore_fake_attrs = true;//atoi(args[++i]);
- else if (strcmp(args[i], "--fakestore_fake_collections") == 0)
- g_conf.fakestore_fake_collections = true;//atoi(args[++i]);
+ else if (strcmp(args[i], "--filestore") == 0)
+ g_conf.filestore = true;
+ else if (strcmp(args[i], "--filestore_sync_interval") == 0)
+ g_conf.filestore_sync_interval = atof(args[++i]); // double, in seconds: atoi() would truncate ".2" to 0
+ else if (strcmp(args[i], "--filestore_dev") == 0)
+ g_conf.filestore_dev = args[++i];
+ else if (strcmp(args[i], "--filestore_fake_attrs") == 0)
+ g_conf.filestore_fake_attrs = true;//atoi(args[++i]);
+ else if (strcmp(args[i], "--filestore_fake_collections") == 0)
+ g_conf.filestore_fake_collections = true;//atoi(args[++i]);
else if (strcmp(args[i], "--osd_balance_reads") == 0)
g_conf.osd_balance_reads = atoi(args[++i]);
int debug_client;
int debug_osd;
int debug_ebofs;
- int debug_fakestore;
+ int debug_filestore;
int debug_journal;
int debug_bdev;
int debug_ns;
bool osd_auto_weight;
- // fakestore
- bool fakestore;
- double fakestore_sync_interval;
- bool fakestore_fake_attrs;
- bool fakestore_fake_collections;
- const char *fakestore_dev;
+ // filestore
+ bool filestore;
+ double filestore_sync_interval;
+ bool filestore_fake_attrs;
+ bool filestore_fake_collections;
+ const char *filestore_dev;
// ebofs
bool ebofs;
#include <iostream>
#include "ebofs/Ebofs.h"
-#include "osd/FakeStore.h"
+#include "os/FileStore.h"
int dupstore(ObjectStore* src, ObjectStore* dst)
if (strcmp(args[0], "ebofs") == 0)
src = new Ebofs(args[1]);
- else if (strcmp(args[0], "fakestore") == 0)
- src = new FakeStore(args[1]);
+ else if (strcmp(args[0], "filestore") == 0)
+ src = new FileStore(args[1]);
else usage();
if (strcmp(args[2], "ebofs") == 0)
dst = new Ebofs(args[3]);
- else if (strcmp(args[2], "fakestore") == 0)
- dst = new FakeStore(args[3]);
+ else if (strcmp(args[2], "filestore") == 0)
+ dst = new FileStore(args[3]);
else usage();
return dupstore(src, dst);
#include "Ebofs.h"
-#include "osd/FileJournal.h"
+#include "os/FileJournal.h"
#include <errno.h>
#include "nodes.h"
#include "Allocator.h"
#include "Table.h"
-#include "osd/Journal.h"
+
+#include "os/Journal.h"
+#include "os/ObjectStore.h"
#include "common/Mutex.h"
#include "common/Cond.h"
#include "common/Finisher.h"
-#include "osd/ObjectStore.h"
-
//typedef pair<object_t,coll_t> object_coll_t;
typedef pair<coll_t,pobject_t> coll_pobject_t;
#define O_LAZY 01000000
-
+typedef __u64 coll_t;
// --------------------------------------
#include "msg/Message.h"
#include "MOSDOp.h"
-#include "osd/ObjectStore.h"
+#include "os/ObjectStore.h"
/*
* OSD op reply
#include "msg/Message.h"
#include "MOSDSubOp.h"
-#include "osd/ObjectStore.h"
+#include "os/ObjectStore.h"
/*
* OSD op reply
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __BERKELEYDB_H
+#define __BERKELEYDB_H
+
+#include <db.h>
+#include <unistd.h>
+
+#include <list>
+using namespace std;
+
+
+/**
+ * BDBMap - minimal typed wrapper around a single Berkeley DB btree file.
+ *
+ * Keys (K) and values (D) are stored as raw sizeof()-sized byte images of
+ * the objects passed in, so both must be fixed-size, trivially copyable
+ * types.  No locking is done here.
+ */
+template<typename K, typename D>
+class BDBMap {
+ private:
+  DB *dbp;   // open handle, or 0 when closed
+
+ public:
+  BDBMap() : dbp(0) {}
+  ~BDBMap() {
+    close();
+  }
+
+  bool is_open() { return dbp ? true:false; }
+
+  // open/close
+
+  /**
+   * Open the btree database file fn, creating it if it does not exist.
+   * Every failure path ends in an assert; returns 0 on success.
+   */
+  int open(const char *fn) {
+    int r;
+    if ((r = db_create(&dbp, NULL, 0)) != 0) {
+      cerr << "db_create: " << db_strerror(r) << endl;
+      assert(0);
+    }
+
+    dbp->set_errfile(dbp, stderr);
+    dbp->set_errpfx(dbp, "bdbmap");
+
+    r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644);
+    if (r != 0) {
+      dbp->err(dbp, r, "%s", fn);
+    }
+    assert(r == 0);
+    return 0;
+  }
+
+  /** Close the handle if it is open; safe to call repeatedly. */
+  void close() {
+    if (dbp) {
+      dbp->close(dbp,0);
+      dbp = 0;
+    }
+  }
+
+  /**
+   * Delete the database file fn.
+   *
+   * Berkeley DB does not permit DB->remove() on a handle that has already
+   * been opened, so any open handle is closed first and a fresh, unopened
+   * handle is used for the removal.  DB->remove() destroys the handle it
+   * is called on regardless of its result.  If no handle can be created,
+   * fall back to unlinking the file directly.
+   */
+  void remove(const char *fn) {
+    close();
+
+    DB *h = 0;
+    if (db_create(&h, NULL, 0) == 0 && h) {
+      h->remove(h, fn, 0, 0);   // h must not be touched after this
+    } else {
+      ::unlink(fn);
+    }
+  }
+
+  // accessors
+
+  /** Store data under key, replacing any existing value.  Returns the BDB result code. */
+  int put(K key,
+          D data) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(K);
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    d.data = &data;
+    d.size = sizeof(data);
+    return dbp->put(dbp, NULL, &k, &d, 0);
+  }
+
+  /**
+   * Look up key and copy its value into data.
+   *
+   * DB_DBT_USERMEM tells Berkeley DB to copy the record into the caller's
+   * buffer (of ulen bytes).  Without it the library only repoints d.data
+   * at its own memory and `data` would never be filled in.
+   *
+   * Returns 0 on success or the BDB error code (e.g. DB_NOTFOUND).
+   */
+  int get(K key,
+          D& data) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(key);
+    DBT d;
+    memset(&d, 0, sizeof(d));
+    d.data = &data;
+    d.ulen = sizeof(data);
+    d.flags = DB_DBT_USERMEM;
+    return dbp->get(dbp, NULL, &k, &d, 0);
+  }
+
+  /** Delete the record stored under key.  Returns the BDB result code. */
+  int del(K key) {
+    DBT k;
+    memset(&k, 0, sizeof(k));
+    k.data = &key;
+    k.size = sizeof(key);
+    return dbp->del(dbp, NULL, &k, 0);
+  }
+
+  /**
+   * Append every key in the database to ls, in cursor order.
+   * Asserts that each stored key is exactly sizeof(K) bytes and that the
+   * scan ends with DB_NOTFOUND.  Returns 0.
+   */
+  int list_keys(list<K>& ls) {
+    DBC *cursor = 0;
+    int r = dbp->cursor(dbp, NULL, &cursor, 0);
+    assert(r == 0);
+
+    DBT k,d;
+    memset(&k, 0, sizeof(k));
+    memset(&d, 0, sizeof(d));
+
+    while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) {
+      K key;
+      assert(k.size == sizeof(key));
+      memcpy(&key, k.data, k.size);
+      ls.push_back(key);
+    }
+    if (r != DB_NOTFOUND) {
+      dbp->err(dbp, r, "DBcursor->get");
+      assert(r == DB_NOTFOUND);
+    }
+
+    cursor->c_close(cursor);
+    return 0;
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FAKE_H
+#define __FAKE_H
+
+#include "include/types.h"
+
+#include <list>
+#include <set>
+#include <ext/hash_map>
+using namespace std;
+using namespace __gnu_cxx;
+
+/**
+ * FakeCollections - in-memory stand-in for an object store's collection
+ * index.  Each collection is a set of pobject_t kept in a hash_map; all
+ * access is serialized by faker_lock.  When an onsafe context is supplied
+ * to a mutating call it is handed to store->sync().
+ */
+class FakeCollections {
+ private:
+  typedef hash_map<coll_t, set<pobject_t> > collmap_t;
+
+  Mutex faker_lock;
+  ObjectStore *store;
+  collmap_t fakecollections;
+
+ public:
+  FakeCollections(ObjectStore *s) : store(s) {}
+
+  // faked collections
+
+  /** Append every known collection id to ls; returns how many were added. */
+  int list_collections(list<coll_t>& ls) {
+    faker_lock.Lock();
+    int r = 0;
+    for (collmap_t::iterator p = fakecollections.begin();
+         p != fakecollections.end();
+         p++) {
+      r++;
+      ls.push_back(p->first);
+    }
+    faker_lock.Unlock();
+    return r;
+  }
+
+  /** Ensure collection c exists (no-op if it already does).  Returns 0. */
+  int create_collection(coll_t c,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c].size();   // operator[] creates the (empty) entry
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  /** Remove collection c.  Returns 0, or -1 if it did not exist. */
+  int destroy_collection(coll_t c,
+                         Context *onsafe=0) {
+    int r = 0;
+    faker_lock.Lock();
+    if (fakecollections.count(c)) {
+      fakecollections.erase(c);
+      //fakecattr.erase(c);
+      if (onsafe) store->sync(onsafe);
+    } else
+      r = -1;
+    faker_lock.Unlock();
+    return r;
+  }
+
+  /** Returns 0 if c exists, -1 otherwise; st is not filled in. */
+  int collection_stat(coll_t c, struct stat *st) {
+    return collection_exists(c) ? 0:-1;
+  }
+
+  bool collection_exists(coll_t c) {
+    faker_lock.Lock();
+    int r = fakecollections.count(c);
+    faker_lock.Unlock();
+    return r;
+  }
+
+  /** Add o to collection c, creating c implicitly if needed.  Returns 0. */
+  int collection_add(coll_t c, pobject_t o,
+                     Context *onsafe=0) {
+    faker_lock.Lock();
+    fakecollections[c].insert(o);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  /**
+   * Remove o from collection c.  Returns 0 whether or not anything was
+   * removed.  Uses find() rather than operator[] so that removing from a
+   * nonexistent collection does not create it as a side effect.
+   */
+  int collection_remove(coll_t c, pobject_t o,
+                        Context *onsafe=0) {
+    faker_lock.Lock();
+    collmap_t::iterator p = fakecollections.find(c);
+    if (p != fakecollections.end())
+      p->second.erase(o);
+    if (onsafe) store->sync(onsafe);
+    faker_lock.Unlock();
+    return 0;
+  }
+
+  /**
+   * Append the members of collection c to o; returns how many were added
+   * (0 for an unknown collection).  Uses find() so that listing does not
+   * create an empty collection that collection_exists() would then report.
+   */
+  int collection_list(coll_t c, list<pobject_t>& o) {
+    faker_lock.Lock();
+    int r = 0;
+    collmap_t::iterator cp = fakecollections.find(c);
+    if (cp != fakecollections.end()) {
+      for (set<pobject_t>::iterator p = cp->second.begin();
+           p != cp->second.end();
+           p++) {
+        o.push_back(*p);
+        r++;
+      }
+    }
+    faker_lock.Unlock();
+    return r;
+  }
+
+};
+
+/**
+ * FakeAttrs - in-memory stand-in for object and collection attributes.
+ *
+ * Attributes live in per-object / per-collection FakeAttrSet maps.  Every
+ * public method holds faker_lock for its whole duration.  Mutating methods
+ * that take an onsafe context pass it to store->sync().  Note that lookups
+ * go through operator[], so querying an unknown object or collection
+ * leaves an empty FakeAttrSet behind for it.
+ */
+class FakeAttrs {
+ private:
+
+  /** Name -> value map for a single object or collection. */
+  class FakeAttrSet {
+  public:
+    map<string, bufferptr> attrs;
+
+    /**
+     * Copy up to size bytes of attribute name into value.
+     * Returns the number of bytes copied, or -1 if the attribute is unset.
+     */
+    int getattr(const char *name, void *value, size_t size) {
+      map<string, bufferptr>::iterator it = attrs.find(string(name));
+      if (it == attrs.end())
+        return -1;
+
+      size_t l = MIN( it->second.length(), size );
+      bufferlist bl;
+      bl.append(it->second);
+      bl.copy(0, l, (char*)value);
+      return l;
+    }
+
+    /** Copy the whole attribute map out into aset.  Returns 0. */
+    int getattrs(map<string,bufferptr>& aset) {
+      aset = attrs;
+      return 0;
+    }
+
+    /** Replace the whole attribute map with aset.  Returns 0. */
+    int setattrs(map<string,bufferptr>& aset) {
+      attrs = aset;
+      return 0;
+    }
+
+    /** Store a private copy of the size bytes at value under name.  Returns 0. */
+    int setattr(const char *name, const void *value, size_t size) {
+      attrs[string(name)] = buffer::copy((char*)value, size);
+      return 0;
+    }
+
+    /** Not implemented: always asserts. */
+    int listattr(char *attrs, size_t size) {
+      assert(0);
+      return 0;
+    }
+
+    /** Drop attribute name if present.  Returns 0. */
+    int rmattr(const char *name) {
+      attrs.erase(string(name));
+      return 0;
+    }
+
+    bool empty() { return attrs.empty(); }
+  };
+
+  Mutex faker_lock;
+  ObjectStore *store;
+  hash_map<pobject_t, FakeAttrSet> fakeoattrs;   // per-object attrs
+  hash_map<coll_t, FakeAttrSet> fakecattrs;      // per-collection attrs
+
+ public:
+  FakeAttrs(ObjectStore *s) : store(s) {}
+
+  // ---- object attributes ----
+
+  int setattr(pobject_t oid, const char *name,
+              const void *value, size_t size,
+              Context *onsafe=0) {
+    Mutex::Locker l(faker_lock);
+    int r = fakeoattrs[oid].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    return r;
+  }
+  int setattrs(pobject_t oid, map<string,bufferptr>& aset) {
+    Mutex::Locker l(faker_lock);
+    return fakeoattrs[oid].setattrs(aset);
+  }
+  int getattr(pobject_t oid, const char *name,
+              void *value, size_t size) {
+    Mutex::Locker l(faker_lock);
+    return fakeoattrs[oid].getattr(name, value, size);
+  }
+  int getattrs(pobject_t oid, map<string,bufferptr>& aset) {
+    Mutex::Locker l(faker_lock);
+    return fakeoattrs[oid].getattrs(aset);
+  }
+  int rmattr(pobject_t oid, const char *name,
+             Context *onsafe=0) {
+    Mutex::Locker l(faker_lock);
+    int r = fakeoattrs[oid].rmattr(name);
+    if (onsafe) store->sync(onsafe);
+    return r;
+  }
+
+  int listattr(pobject_t oid, char *attrs, size_t size) {
+    Mutex::Locker l(faker_lock);
+    return fakeoattrs[oid].listattr(attrs,size);
+  }
+
+  // ---- collection attributes ----
+
+  int collection_setattr(coll_t c, const char *name,
+                         void *value, size_t size,
+                         Context *onsafe=0) {
+    Mutex::Locker l(faker_lock);
+    int r = fakecattrs[c].setattr(name, value, size);
+    if (onsafe) store->sync(onsafe);
+    return r;
+  }
+  int collection_setattrs(coll_t cid, map<string,bufferptr>& aset) {
+    Mutex::Locker l(faker_lock);
+    return fakecattrs[cid].setattrs(aset);
+  }
+  int collection_getattrs(coll_t cid, map<string,bufferptr>& aset) {
+    Mutex::Locker l(faker_lock);
+    return fakecattrs[cid].getattrs(aset);
+  }
+  int collection_rmattr(coll_t c, const char *name,
+                        Context *onsafe=0) {
+    Mutex::Locker l(faker_lock);
+    int r = fakecattrs[c].rmattr(name);
+    if (onsafe) store->sync(onsafe);
+    return r;
+  }
+  int collection_getattr(coll_t c, const char *name,
+                         void *value, size_t size) {
+    Mutex::Locker l(faker_lock);
+    return fakecattrs[c].getattr(name, value, size);
+  }
+  int collection_listattr(coll_t c, char *attrs, size_t size) {
+    Mutex::Locker l(faker_lock);
+    return fakecattrs[c].listattr(attrs,size);
+  }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FAKESTOREBDBCOLLECTIONS_H
+#define __FAKESTOREBDBCOLLECTIONS_H
+
+#include "BDBMap.h"
+#include "ObjectStore.h"
+#include "common/Mutex.h"
+
+#define BDBHASH_DIRS 128LL
+#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS)
+
+/**
+ * FakeStoreBDBCollections - collection index kept in Berkeley DB files.
+ *
+ * One BDBMap ("<basedir>/<whoami>/collections") records which collections
+ * exist; each collection additionally has its own BDBMap file (see
+ * get_collfn) listing its member objects.  Per-collection maps are opened
+ * lazily, cached in collection_map, and owned by this object.  All public
+ * methods serialize on bdblock.
+ */
+class FakeStoreBDBCollections {
+ private:
+  typedef BDBMap<object_t, int> objmap_t;
+
+  int whoami;
+  string basedir;
+
+  Mutex bdblock;
+
+  // collection dbs
+  BDBMap<coll_t, int> collections;
+  map<coll_t, objmap_t*> collection_map;   // open per-collection dbs (owned)
+
+  // dirs
+  void get_dir(string& dir) {
+    char s[30];
+    snprintf(s, sizeof(s), "%d", whoami);
+    dir = basedir + "/" + s;
+  }
+  void get_collfn(coll_t c, string &fn) {
+    char s[100];
+    // casts keep the arguments in step with the %llx conversions
+    snprintf(s, sizeof(s), "%d/%02llx/%016llx.co", whoami,
+             (unsigned long long)BDBHASH_FUNC(c), (unsigned long long)c);
+    fn = basedir + "/" + s;
+  }
+
+  void open_collections() {
+    string cfn;
+    get_dir(cfn);
+    cfn += "/collections";
+    collections.open(cfn.c_str());
+    list<coll_t> ls;
+    collections.list_keys(ls);
+  }
+
+  /** Close the index and close + free every cached per-collection map. */
+  void close_collections() {
+    if (collections.is_open())
+      collections.close();
+
+    for (map<coll_t, objmap_t*>::iterator it = collection_map.begin();
+         it != collection_map.end();
+         it++) {
+      it->second->close();
+      delete it->second;   // we own these; previously leaked
+    }
+    collection_map.clear();
+  }
+
+  /**
+   * Make sure the per-collection map for c is open and cached.
+   * Returns 0 on success (or if already open), otherwise the open() error,
+   * in which case nothing is left in collection_map for c.
+   */
+  int open_collection(coll_t c) {
+    if (collection_map.count(c))
+      return 0;  // already open.
+
+    string fn;
+    get_collfn(c,fn);
+    objmap_t *m = new objmap_t;   // must match collection_map's value type
+    int r = m->open(fn.c_str());
+    if (r != 0) {
+      delete m;                   // failed; don't leak or cache it
+      return r;
+    }
+    collection_map[c] = m;
+    return 0;
+  }
+
+ public:
+  FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {}
+  ~FakeStoreBDBCollections() {
+    close_collections();
+  }
+
+  /** Replace the contents of ls with all collection ids.  Returns 0. */
+  int list_collections(list<coll_t>& ls) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    ls.clear();
+    collections.list_keys(ls);
+    bdblock.Unlock();
+    return 0;
+  }
+
+  /** Record c in the index and open its member map.  Returns 0. */
+  int create_collection(coll_t c) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    collections.put(c, 1);
+    open_collection(c);
+    bdblock.Unlock();
+    return 0;
+  }
+
+  /** Drop c from the index and delete its member map file.  Returns 0. */
+  int destroy_collection(coll_t c) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    collections.del(c);
+
+    open_collection(c);
+    objmap_t *m = collection_map[c];
+    m->close();
+
+    string fn;
+    get_collfn(c,fn);
+    m->remove(fn.c_str());
+    delete m;
+    collection_map.erase(c);
+    bdblock.Unlock();
+    return 0;
+  }
+
+  /** stat(2) the member map file of c into st; returns stat's result. */
+  int collection_stat(coll_t c, struct stat *st) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    string fn;
+    get_collfn(c,fn);
+    int r = ::stat(fn.c_str(), st);
+    bdblock.Unlock();
+    return r;
+  }
+
+  /**
+   * True if the member map file of c exists.
+   * collection_stat() takes bdblock itself, so it is not taken here as
+   * well; locking twice would self-deadlock on a non-recursive mutex.
+   */
+  bool collection_exists(coll_t c) {
+    struct stat st;
+    return collection_stat(c, &st) == 0;
+  }
+
+  /** Add o to collection c.  Returns 0. */
+  int collection_add(coll_t c, object_t o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    open_collection(c);
+    collection_map[c]->put(o,1);
+    bdblock.Unlock();
+    return 0;
+  }
+
+  /** Remove o from collection c.  Returns 0. */
+  int collection_remove(coll_t c, object_t o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    open_collection(c);
+    collection_map[c]->del(o);
+    bdblock.Unlock();
+    return 0;
+  }
+
+  /** Append the members of collection c to o.  Returns 0. */
+  int collection_list(coll_t c, list<object_t>& o) {
+    bdblock.Lock();
+    if (!collections.is_open()) open_collections();
+
+    open_collection(c);
+    collection_map[c]->list_keys(o);
+    bdblock.Unlock();
+    return 0;
+  }
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#include "FileJournal.h"
+
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+
+#include "config.h"
+
+#define dout(x) if (x <= g_conf.debug_journal) *_dout << dbeginl << g_clock.now() << " journal "
+#define derr(x) if (x <= g_conf.debug_journal) *_derr << dbeginl << g_clock.now() << " journal "
+
+
+/*
+ * (Re)open the journal file fn, closing any descriptor already held.
+ *
+ * forwrite selects O_RDWR (plus O_DIRECT when directio is set); otherwise
+ * the file is opened O_RDONLY.  On success max_size and block_size are
+ * refreshed from fstat() and 0 is returned.  On failure returns -errno of
+ * the failed open().
+ */
+int FileJournal::_open(bool forwrite)
+{
+  int flags;
+
+  if (forwrite) {
+    flags = O_RDWR;
+    if (directio) flags |= O_DIRECT;
+  } else {
+    flags = O_RDONLY;
+  }
+
+  if (fd >= 0)
+    ::close(fd);
+  fd = ::open(fn.c_str(), flags);
+  if (fd < 0) {
+    // capture errno first: the logging below may itself modify it
+    int open_errno = errno;
+    dout(2) << "_open failed " << open_errno << " " << strerror(open_errno) << dendl;
+    return -open_errno;
+  }
+
+  // get size
+  struct stat st;
+  int r = ::fstat(fd, &st);
+  assert(r == 0);
+  max_size = st.st_size;
+  block_size = st.st_blksize;
+  dout(2) << "_open " << fn << " fd " << fd
+          << ": " << st.st_size << " bytes, block size " << block_size << dendl;
+
+  return 0;
+}
+
+/*
+ * Initialize a journal: open fn for writing, build an empty header from
+ * fsid and the geometry discovered by _open(), and write it at offset 0.
+ *
+ * The descriptor is closed again before returning, on both the success
+ * and the write-failure path.  Returns 0 on success, or a negative errno
+ * from _open() / pwrite().
+ */
+int FileJournal::create()
+{
+  dout(2) << "create " << fn << dendl;
+
+  int err = _open(true);
+  if (err < 0) return err;
+
+  // write empty header
+  memset(&header, 0, sizeof(header));
+  header.clear();
+  header.fsid = fsid;
+  header.max_size = max_size;
+  header.block_size = block_size;
+  if (directio)
+    header.alignment = block_size;
+  else
+    header.alignment = 16;  // at least stay word aligned on 64bit machines...
+  print_header();
+
+  buffer::ptr bp = prepare_header();
+  int r = ::pwrite(fd, bp.c_str(), bp.length(), 0);
+  if (r < 0) {
+    // capture errno first: logging and close() below may modify it
+    int write_errno = errno;
+    dout(0) << "create write header error " << write_errno << " " << strerror(write_errno) << dendl;
+    ::close(fd);   // don't leak the descriptor on failure
+    fd = -1;
+    return -write_errno;
+  }
+
+  ::close(fd);
+  fd = -1;
+  dout(2) << "create done" << dendl;
+  return 0;
+}
+
+/*
+ * Open an existing journal read-only and position read_pos for replay.
+ *
+ * The on-disk header is read and checked against this instance: fsid must
+ * match, header.max_size must not exceed the file size, block_size must
+ * match, and (in directio mode) alignment must equal block_size.  Any
+ * mismatch returns -EINVAL; a failed _open() returns its error.
+ *
+ * On a valid header, read_pos is set to the offset recorded for `epoch`
+ * if the header has one.  Returns 0 in all remaining cases, including
+ * when no matching segment exists (read_pos stays 0) and when the header
+ * holds no segments at all (read_pos is set to -1).
+ *
+ * NOTE(review): read_entry() in this file only tests `!read_pos`, so the
+ * -1 set for an empty journal is not treated as "not readable" there --
+ * confirm that is intended.
+ */
+int FileJournal::open(epoch_t epoch)
+{
+ dout(2) << "open " << fn << dendl;
+
+ int err = _open(false);
+ if (err < 0) return err;
+
+ // assume writeable, unless...
+ read_pos = 0;
+ write_pos = get_top();
+
+ // read header?
+ read_header();
+ dout(10) << "open journal header.fsid = " << header.fsid
+ //<< " vs expected fsid = " << fsid
+ << dendl;
+ // every check runs (and logs) even after an earlier one has failed
+ if (header.fsid != fsid) {
+ dout(2) << "open journal fsid doesn't match, invalid (someone else's?) journal" << dendl;
+ err = -EINVAL;
+ }
+ if (header.max_size > max_size) {
+ dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
+ err = -EINVAL;
+ }
+ if (header.block_size != block_size) {
+ dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
+ err = -EINVAL;
+ }
+ if (header.alignment != block_size && directio) {
+ derr(0) << "open journal alignment " << header.alignment << " does not match block size "
+ << block_size << " (required for direct_io journal mode)" << dendl;
+ err = -EINVAL;
+ }
+ if (err)
+ return err;
+
+ // looks like a valid header.
+ write_pos = 0; // not writeable yet
+ read_pos = 0;
+
+ if (header.num > 0) {
+ // pick an offset
+ // scan stops at the first entry whose epoch is >= the requested one
+ for (int i=0; i<header.num; i++) {
+ if (header.epoch[i] == epoch) {
+ dout(2) << "using read_pos header pointer "
+ << header.epoch[i] << " at " << header.offset[i]
+ << dendl;
+ read_pos = header.offset[i];
+ write_pos = 0;
+ break;
+ }
+ else if (header.epoch[i] < epoch) {
+ dout(2) << "super_epoch is " << epoch
+ << ", skipping old " << header.epoch[i] << " at " << header.offset[i]
+ << dendl;
+ }
+ else if (header.epoch[i] > epoch) {
+ dout(2) << "super_epoch is " << epoch
+ << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i]
+ << dendl;
+ break;
+ }
+ }
+
+ if (read_pos == 0) {
+ dout(0) << "no valid journal segments" << dendl;
+ return 0; //hrm return -EINVAL;
+ }
+
+ } else {
+ dout(0) << "journal was empty" << dendl;
+ read_pos = -1;
+ }
+
+ return 0;
+}
+
+/*
+ * Shut the journal down: stop and join the writer thread, then close the
+ * file descriptor.  Both the write queue and the commit queue must already
+ * be empty, and the journal must be open.
+ */
+void FileJournal::close()
+{
+  dout(1) << "close " << fn << dendl;
+
+  // stop writer thread
+  stop_writer();
+
+  // close
+  assert(writeq.empty());
+  assert(commitq.empty());
+  assert(fd >= 0);   // 0 is a valid descriptor; "closed" is fd == -1 (see _open)
+  ::close(fd);
+  fd = -1;
+}
+
+/*
+ * Reset the stop flag, then create the writer thread.
+ */
+void FileJournal::start_writer()
+{
+  // the flag is cleared before the thread is created
+  write_stop = false;
+
+  write_thread.create();
+}
+
+/*
+ * Ask the writer thread to stop and wait for it to exit.
+ *
+ * The stop flag is raised and the condition signalled while holding
+ * write_lock; the lock is released before joining.
+ */
+void FileJournal::stop_writer()
+{
+  {
+    Mutex::Locker l(write_lock);
+    write_stop = true;
+    write_cond.Signal();
+  }
+
+  write_thread.join();
+}
+
+
+/*
+ * Log (at debug level 10) every epoch/offset pair held in the header.
+ * An offset smaller than its predecessor means the journal wrapped there;
+ * in that case header.wrap must be set and is logged too.
+ */
+void FileJournal::print_header()
+{
+  for (int idx = 0; idx < header.num; idx++) {
+    const bool went_backwards =
+      idx > 0 && header.offset[idx] < header.offset[idx-1];
+
+    if (went_backwards) {
+      assert(header.wrap);
+      dout(10) << "header: wrap at " << header.wrap << dendl;
+    }
+
+    dout(10) << "header: epoch " << header.epoch[idx] << " at " << header.offset[idx] << dendl;
+  }
+}
+
+/*
+ * Load the journal header from offset 0 of fd into `header`.
+ *
+ * In directio mode the read goes through a zeroed, page-aligned buffer of
+ * block_size bytes which is then copied into the header; otherwise the
+ * header is zeroed and read into directly.  A failed pread() is logged
+ * but not reported to the caller; the header is printed either way.
+ */
+void FileJournal::read_header()
+{
+  int r;
+  dout(10) << "read_header" << dendl;
+
+  if (!directio) {
+    memset(&header, 0, sizeof(header));  // zero out (read may fail)
+    r = ::pread(fd, &header, sizeof(header), 0);
+  } else {
+    buffer::ptr aligned = buffer::create_page_aligned(block_size);
+    aligned.zero();
+    r = ::pread(fd, aligned.c_str(), aligned.length(), 0);
+    memcpy(&header, aligned.c_str(), sizeof(header));
+  }
+
+  if (r < 0) {
+    dout(0) << "read_header error " << errno << " " << strerror(errno) << dendl;
+  }
+
+  print_header();
+}
+
+/*
+ * Build a buffer holding a copy of the current header, ready to write.
+ *
+ * In directio mode the buffer is page aligned, block_size bytes long and
+ * zero filled past the header; otherwise it is exactly sizeof(header).
+ */
+bufferptr FileJournal::prepare_header()
+{
+  bufferptr out;
+
+  if (directio) {
+    out = buffer::create_page_aligned(block_size);
+    out.zero();
+  } else {
+    out = buffer::create(sizeof(header));
+  }
+
+  // the copy itself is the same for both buffer flavours
+  memcpy(out.c_str(), &header, sizeof(header));
+  return out;
+}
+
+
+
+
+/*
+ * Decide whether an entry of `size` bytes for `epoch` fits at offset `pos`,
+ * updating the in-memory header as needed.
+ *
+ *  - If epoch is newer than the header's last epoch, a new (epoch, pos)
+ *    pointer is pushed and must_write_header is set.
+ *  - If the journal has already wrapped and the entry would reach
+ *    header.offset[0], the journal is marked full and writeq is cleared.
+ *  - If it has not wrapped but the entry would reach header.max_size, the
+ *    journal wraps to get_top() when there is room before header.offset[0];
+ *    otherwise it is marked full and writeq is cleared.
+ *
+ * NOTE(review): pos is passed by value, so the `pos = get_top()` in the
+ * wrap branch is not visible to the caller.  The callers in this file
+ * (prepare_multi_write / prepare_single_dio_write) keep using their own
+ * queue_pos / write_pos afterwards -- confirm whether they are expected to
+ * pick up the wrapped position some other way.
+ */
+void FileJournal::check_for_wrap(epoch_t epoch, off64_t pos, off64_t size)
+{
+ // epoch boundary?
+ dout(10) << "check_for_wrap epoch " << epoch << " last " << header.last_epoch() << " of " << header.num << dendl;
+ if (epoch > header.last_epoch()) {
+ dout(10) << "saw an epoch boundary " << header.last_epoch() << " -> " << epoch << dendl;
+ header.push(epoch, pos);
+ must_write_header = true;
+ }
+
+ // does it fit?
+ if (header.wrap) {
+ // we're wrapped. don't overwrite ourselves.
+ if (pos + size >= header.offset[0]) {
+ dout(10) << "JOURNAL FULL (and wrapped), " << pos << "+" << size
+ << " >= " << header.offset[0]
+ << dendl;
+ full = true;
+ writeq.clear();
+ print_header();
+ }
+ } else {
+ // we haven't wrapped.
+ if (pos + size >= header.max_size) {
+ // is there room if we wrap?
+ if (get_top() + size < header.offset[0]) {
+ // yes!
+ dout(10) << "wrapped from " << pos << " to " << get_top() << dendl;
+ // remember where the old tail ended, then restart at the top
+ header.wrap = pos;
+ pos = get_top();
+ header.push(epoch, pos);
+ must_write_header = true;
+ } else {
+ // no room.
+ dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << pos << "+" << size
+ << " >= " << header.max_size
+ << dendl;
+ full = true;
+ writeq.clear();
+ }
+ }
+ }
+}
+
+
+/*
+ * Batch as many queued entries as allowed into bl for one buffered write.
+ *
+ * Entries are taken from the front of writeq.  Each is framed as
+ * entry_header_t + payload + entry_header_t (the same header repeated as a
+ * footer).  Batching stops when the queue is empty, the journal becomes
+ * full, a header rewrite is needed after something was already batched,
+ * or the g_conf.journal_max_write_entries / journal_max_write_bytes
+ * budgets are used up.  For every entry consumed, its commit context (if
+ * any) moves from commitq to writingq.
+ */
+void FileJournal::prepare_multi_write(bufferlist& bl)
+{
+ // gather queued writes
+ off64_t queue_pos = write_pos;
+
+ int eleft = g_conf.journal_max_write_entries;
+ int bleft = g_conf.journal_max_write_bytes;
+
+ while (!writeq.empty()) {
+ // grab next item
+ epoch_t epoch = writeq.front().first;
+ bufferlist &ebl = writeq.front().second;
+ off64_t size = 2*sizeof(entry_header_t) + ebl.length();
+
+ // byte budget: only enforced once at least one entry is in the batch,
+ // and only while bleft is still positive
+ if (bl.length() > 0 && bleft > 0 && bleft < size) break;
+
+ check_for_wrap(epoch, queue_pos, size);
+ if (full) break;
+ // a header change takes effect with the next batch, not mid-batch
+ if (bl.length() && must_write_header)
+ break;
+
+ // add to write buffer
+ dout(15) << "prepare_multi_write will write " << queue_pos << " : "
+ << ebl.length() << " epoch " << epoch << " -> " << size << dendl;
+
+ // add this entry
+ entry_header_t h;
+ h.epoch = epoch;
+ h.len = ebl.length();
+ h.make_magic(queue_pos, header.fsid);
+ bl.append((const char*)&h, sizeof(h));
+ bl.claim_append(ebl);
+ bl.append((const char*)&h, sizeof(h));
+
+ Context *oncommit = commitq.front();
+ if (oncommit)
+ writingq.push_back(oncommit);
+
+ // pop from writeq
+ writeq.pop_front();
+ commitq.pop_front();
+
+ queue_pos += size;
+ if (--eleft == 0) break;
+ bleft -= size;
+ if (bleft == 0) break;
+ }
+}
+
+/*
+ * Build a single page-aligned buffer holding the entry at the front of
+ * writeq, for a direct-I/O write.
+ *
+ * Layout is entry_header_t + payload + entry_header_t (footer copy), with
+ * the total rounded up to header.alignment.  The rounding padding is
+ * zeroed so that no uninitialized memory reaches the journal file.
+ *
+ * On success the buffer is appended to bl, the entry is popped from
+ * writeq/commitq (its commit context, if any, moves to writingq) and true
+ * is returned.  Returns false, with bl untouched, if check_for_wrap()
+ * marked the journal full.
+ */
+bool FileJournal::prepare_single_dio_write(bufferlist& bl)
+{
+  // grab next item
+  epoch_t epoch = writeq.front().first;
+  bufferlist &ebl = writeq.front().second;
+
+  off64_t used = 2*sizeof(entry_header_t) + ebl.length();
+  off64_t size = used;
+  size = ROUND_UP_TO(size, header.alignment);
+
+  check_for_wrap(epoch, write_pos, size);
+  if (full) return false;
+
+  // build it
+  dout(15) << "prepare_single_dio_write will write " << write_pos << " : "
+           << ebl.length() << " epoch " << epoch << " -> " << size << dendl;
+
+  bufferptr bp = buffer::create_page_aligned(size);
+  entry_header_t *h = (entry_header_t*)bp.c_str();
+  h->epoch = epoch;
+  h->len = ebl.length();
+  h->make_magic(write_pos, header.fsid);
+  ebl.copy(0, ebl.length(), bp.c_str()+sizeof(*h));
+  memcpy(bp.c_str() + sizeof(*h) + ebl.length(), h, sizeof(*h));
+  if (size > used)
+    memset(bp.c_str() + used, 0, size - used);   // zero the alignment padding
+  bl.push_back(bp);
+
+  Context *oncommit = commitq.front();
+  if (oncommit)
+    writingq.push_back(oncommit);
+
+  // pop from writeq
+  writeq.pop_front();
+  commitq.pop_front();
+  return true;
+}
+
+/*
+ * Write the prepared batch bl (and, if must_write_header is set, the
+ * header) to the journal file.
+ *
+ * Entered with write_lock held.  The lock is dropped around the I/O and
+ * re-taken afterwards.  A snapshot of the header taken before unlocking is
+ * compared with the header after relocking: only if it is unchanged is
+ * write_pos advanced (rounded up to header.alignment) and writingq handed
+ * to the finisher.
+ *
+ * NOTE(review): the results of the header pwrite() and of lseek() are not
+ * checked, a failed write() is only logged, and `pos` is accumulated but
+ * never read in this function.
+ */
+void FileJournal::do_write(bufferlist& bl)
+{
+ // nothing to do?
+ if (bl.length() == 0 && !must_write_header)
+ return;
+
+ buffer::ptr hbp;
+ if (must_write_header)
+ hbp = prepare_header();
+
+ writing = true;
+
+ // snapshot so a header change made while unlocked can be detected below
+ header_t old_header = header;
+
+ write_lock.Unlock();
+
+ dout(15) << "do_write writing " << write_pos << "~" << bl.length()
+ << (must_write_header ? " + header":"")
+ << dendl;
+
+ // header
+ if (hbp.length())
+ ::pwrite(fd, hbp.c_str(), hbp.length(), 0);
+
+ // entry
+#ifdef DARWIN
+ off_t pos = write_pos;
+ ::lseek(fd, write_pos, SEEK_SET);
+#else
+ off64_t pos = write_pos;
+ ::lseek64(fd, write_pos, SEEK_SET);
+#endif
+ // one write() per underlying buffer, sequentially from write_pos
+ for (list<bufferptr>::const_iterator it = bl.buffers().begin();
+ it != bl.buffers().end();
+ it++) {
+ if ((*it).length() == 0) continue; // blank buffer.
+ int r = ::write(fd, (char*)(*it).c_str(), (*it).length());
+ if (r < 0)
+ derr(0) << "do_write failed with " << errno << " " << strerror(errno)
+ << " with " << (void*)(*it).c_str() << " len " << (*it).length()
+ << dendl;
+ pos += (*it).length();
+ }
+ // no explicit sync is issued in directio mode
+#ifdef DARWIN
+ if (!directio)
+ ::fsync(fd);
+#else
+ if (!directio)
+ ::fdatasync(fd);
+#endif
+
+
+ write_lock.Lock();
+
+ writing = false;
+ if (memcmp(&old_header, &header, sizeof(header)) == 0) {
+ write_pos += bl.length();
+ write_pos = ROUND_UP_TO(write_pos, header.alignment);
+ finisher->queue(writingq);
+ } else {
+ dout(10) << "do_write finished write but header changed? not moving write_pos." << dendl;
+ derr(0) << "do_write finished write but header changed? not moving write_pos." << dendl;
+ assert(writingq.empty());
+ }
+}
+
+
+/*
+ * Writer loop: runs until write_stop is set.
+ *
+ * Holds write_lock except while waiting on write_cond (and inside
+ * do_write, which drops it around the I/O).  Whenever writeq is non-empty
+ * a batch is prepared -- a single entry in directio mode, otherwise as
+ * many as prepare_multi_write allows -- and written.
+ */
+void FileJournal::write_thread_entry()
+{
+  dout(10) << "write_thread_entry start" << dendl;
+  write_lock.Lock();
+
+  while (!write_stop) {
+    if (!writeq.empty()) {
+      bufferlist batch;
+      must_write_header = false;
+      if (directio) {
+        prepare_single_dio_write(batch);
+      } else {
+        prepare_multi_write(batch);
+      }
+      do_write(batch);
+    } else {
+      // nothing queued: sleep until signalled
+      dout(20) << "write_thread_entry going to sleep" << dendl;
+      write_cond.Wait(write_lock);
+      dout(20) << "write_thread_entry woke up" << dendl;
+    }
+  }
+
+  write_lock.Unlock();
+  dout(10) << "write_thread_entry finish" << dendl;
+}
+
+
+/*
+ * Report the `full` flag, read while holding write_lock.
+ */
+bool FileJournal::is_full()
+{
+  write_lock.Lock();
+  bool now_full = full;
+  write_lock.Unlock();
+  return now_full;
+}
+
+/*
+ * Queue entry e (tagged with epoch) for the writer thread.
+ *
+ * oncommit is always appended to commitq.  The entry itself is appended to
+ * writeq, and the writer signalled, only when the journal is not full.
+ *
+ * NOTE(review): when `full` is set commitq grows but writeq does not, so
+ * the two queues no longer line up one-to-one, while the consumers in this
+ * file pop them in lockstep -- confirm this is handled elsewhere.
+ */
+void FileJournal::submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit)
+{
+  Mutex::Locker l(write_lock);  // ** lock **
+
+  dout(10) << "submit_entry " << e.length()
+           << " epoch " << epoch
+           << " " << oncommit << dendl;
+
+  commitq.push_back(oncommit);
+  if (full)
+    return;
+
+  // dump on queue, then kick the writer thread
+  writeq.push_back(pair<epoch_t,bufferlist>(epoch, e));
+  write_cond.Signal();
+}
+
+
+/*
+ * Called when a commit begins and new_epoch starts.
+ *
+ * Only matters when the journal is flagged full: if the header still
+ * holds segments the journal stays full for this epoch; if the header is
+ * empty the full flag is cleared and the journal becomes usable again.
+ */
+void FileJournal::commit_epoch_start(epoch_t new_epoch)
+{
+  dout(10) << "commit_epoch_start on " << new_epoch-1
+           << " -- new epoch " << new_epoch
+           << dendl;
+
+  Mutex::Locker l(write_lock);
+
+  if (!full)
+    return;
+
+  // was full -> empty -> now usable?
+  if (header.num != 0) {
+    dout(1) << " journal FULL, ignoring this epoch" << dendl;
+    return;
+  }
+
+  dout(1) << " clearing FULL flag, journal now usable" << dendl;
+  full = false;
+}
+
+/*
+ * Called once everything before new_epoch has been committed.
+ *
+ * If the journal is full, the header is cleared and write_pos reset to
+ * get_top() (the full flag itself is left set here).  Otherwise header
+ * pointers for epochs older than new_epoch are popped, and if none remain
+ * a fresh pointer for new_epoch is pushed at get_top().  In both cases
+ * the header is marked for rewriting.
+ *
+ * Entries still sitting in writeq for epochs before new_epoch are dropped
+ * unwritten; their commit contexts are moved to writingq, which is then
+ * handed to the finisher.
+ */
+void FileJournal::commit_epoch_finish(epoch_t new_epoch)
+{
+ dout(10) << "commit_epoch_finish committed " << (new_epoch-1) << dendl;
+
+ Mutex::Locker locker(write_lock);
+
+ if (full) {
+ // full journal damage control.
+ dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." << dendl;
+ header.clear();
+ write_pos = get_top();
+ } else {
+ // update header -- trim/discard old (committed) epochs
+ print_header();
+ while (header.num && header.epoch[0] < new_epoch) {
+ dout(10) << " popping epoch " << header.epoch[0] << " < " << new_epoch << dendl;
+ header.pop();
+ }
+ if (header.num == 0) {
+ dout(10) << " starting fresh" << dendl;
+ write_pos = get_top();
+ header.push(new_epoch, write_pos);
+ }
+ }
+ must_write_header = true;
+
+ // discard any unwritten items in previous epoch
+ while (!writeq.empty() && writeq.front().first < new_epoch) {
+ dout(15) << " dropping unwritten and committed "
+ << write_pos << " : " << writeq.front().second.length()
+ << " epoch " << writeq.front().first
+ << dendl;
+ // finisher?
+ Context *oncommit = commitq.front();
+ if (oncommit) writingq.push_back(oncommit);
+
+ // discard.
+ writeq.pop_front();
+ commitq.pop_front();
+ }
+
+ // queue the finishers
+ finisher->queue(writingq);
+ dout(10) << "commit_epoch_finish done" << dendl;
+}
+
+
+/*
+ * Switch the journal into write mode.
+ *
+ * Reopens the file for writing, starts appending at read_pos if it is
+ * positive (otherwise at get_top()), resets read_pos, marks the header
+ * for rewriting and starts the writer thread.
+ *
+ * NOTE(review): the result of _open(true) is not checked here.
+ */
+void FileJournal::make_writeable()
+{
+  _open(true);
+
+  write_pos = (read_pos > 0) ? read_pos : get_top();
+  read_pos = 0;
+
+  must_write_header = true;
+  start_writer();
+}
+
+
+/*
+ * Read the next journal entry at read_pos.
+ *
+ * On success the payload is appended to bl, epoch is set from the entry
+ * header, read_pos advances past the entry (rounded up to
+ * header.alignment) and true is returned.
+ *
+ * Returns false when the journal is not readable (read_pos == 0) or when
+ * the entry at read_pos is not complete and valid: a short read of the
+ * header, body or footer, a bad header/footer magic, or a footer that
+ * disagrees with the header.  All of these mark the end of the journal.
+ * Every read() is length-checked so that uninitialized memory is never
+ * interpreted as an entry.
+ */
+bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch)
+{
+  if (!read_pos) {
+    dout(2) << "read_entry -- not readable" << dendl;
+    return false;
+  }
+
+  if (read_pos == header.wrap) {
+    // find wrap point
+    for (int i=1; i<header.num; i++) {
+      if (header.offset[i] < read_pos) {
+        assert(header.offset[i-1] < read_pos);
+        read_pos = header.offset[i];
+        break;
+      }
+    }
+    assert(read_pos != header.wrap);
+    dout(10) << "read_entry wrapped from " << header.wrap << " to " << read_pos << dendl;
+  }
+
+  // header
+  entry_header_t h;
+#ifdef DARWIN
+  ::lseek(fd, read_pos, SEEK_SET);
+#else
+  ::lseek64(fd, read_pos, SEEK_SET);
+#endif
+  if (::read(fd, &h, sizeof(h)) != (ssize_t)sizeof(h) ||
+      !h.check_magic(read_pos, header.fsid)) {
+    dout(2) << "read_entry " << read_pos << " : bad header magic, end of journal" << dendl;
+    return false;
+  }
+
+  // body
+  bufferptr bp(h.len);
+  if (::read(fd, bp.c_str(), h.len) != (ssize_t)h.len) {
+    dout(2) << "read_entry " << read_pos << " : short read of entry body, partial entry, end of journal" << dendl;
+    return false;
+  }
+
+  // footer
+  entry_header_t f;
+  if (::read(fd, &f, sizeof(f)) != (ssize_t)sizeof(f) ||
+      !f.check_magic(read_pos, header.fsid) ||
+      h.epoch != f.epoch ||
+      h.len != f.len) {
+    dout(2) << "read_entry " << read_pos << " : bad footer magic, partial entry, end of journal" << dendl;
+    return false;
+  }
+
+
+  // yay!
+  dout(1) << "read_entry " << read_pos << " : "
+          << " " << h.len << " bytes"
+          << " epoch " << h.epoch
+          << dendl;
+
+  bl.push_back(bp);
+  epoch = h.epoch;
+
+  read_pos += 2*sizeof(entry_header_t) + h.len;
+  read_pos = ROUND_UP_TO(read_pos, header.alignment);
+
+  return true;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_FILEJOURNAL_H
+#define __EBOFS_FILEJOURNAL_H
+
+
+#include "Journal.h"
+#include "common/Cond.h"
+#include "common/Mutex.h"
+#include "common/Thread.h"
+
+/*
+ * FileJournal -- Journal implementation kept in a single file (fn).
+ *
+ * Entries waiting to be written sit in writeq/commitq; a dedicated Writer
+ * thread runs write_thread_entry().  read_pos != 0 means the journal is
+ * in read (replay) mode; read_pos == 0 means it is writeable.
+ */
+class FileJournal : public Journal {
+public:
+  /** log header
+   * we allow 4 pointers:
+   *  top/initial,
+   *  one for an epoch boundary (if any),
+   *  one for a wrap in the ring buffer/journal file,
+   *  one for a second epoch boundary (if any).
+   * the epoch boundary one is useful only for speedier recovery in certain cases
+   * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!)
+   */
+  struct header_t {
+    __u64 fsid;
+    __s64 num;          // number of valid epoch[]/offset[] slots
+    __u32 block_size;
+    __u32 alignment;
+    __s64 max_size;
+    __s64 wrap;         // wrap offset, 0 if none
+    __u32 epoch[4];
+    __s64 offset[4];
+
+    header_t() : fsid(0), num(0), block_size(0), alignment(0), max_size(0), wrap(0) {
+      // zero the slots as well so the struct never carries indeterminate values
+      for (int i=0; i<4; i++) {
+        epoch[i] = 0;
+        offset[i] = 0;
+      }
+    }
+
+    // forget all pointers and any wrap
+    void clear() {
+      num = 0;
+      wrap = 0;
+    }
+    // drop the oldest (front) pointer, shifting the rest down
+    void pop() {
+      if (num >= 2 && offset[0] > offset[1])
+        wrap = 0;  // we're eliminating a wrap
+      num--;
+      for (int i=0; i<num; i++) {
+        epoch[i] = epoch[i+1];
+        offset[i] = offset[i+1];
+      }
+    }
+    // append pointer (e, o); may overwrite the tail slot, see below
+    void push(epoch_t e, off64_t o) {
+      assert(num < 4);
+      if (num > 2 &&
+          epoch[num-1] == e &&
+          epoch[num-2] == (e-1))
+        num--;  // tail was an epoch boundary; replace it.
+      epoch[num] = e;
+      offset[num] = o;
+      num++;
+    }
+    // epoch of the newest pointer, or 0 if there are none
+    epoch_t last_epoch() {
+      if (num)
+        return epoch[num-1];
+      else
+        return 0;
+    }
+  } header;
+
+  /*
+   * Per-entry header; read_entry() reads the same struct again after the
+   * payload as a footer.  magic1 is the entry's file position and magic2
+   * is fsid ^ epoch ^ len.
+   */
+  struct entry_header_t {
+    uint64_t epoch;
+    uint64_t len;
+    uint64_t magic1;
+    uint64_t magic2;
+
+    void make_magic(off64_t pos, uint64_t fsid) {
+      magic1 = pos;
+      magic2 = fsid ^ epoch ^ len;
+    }
+    bool check_magic(off64_t pos, uint64_t fsid) {
+      return
+        magic1 == (uint64_t)pos &&
+        magic2 == (fsid ^ epoch ^ len);
+    }
+  };
+
+private:
+  string fn;
+
+  off64_t max_size;
+  size_t block_size;
+  bool directio;
+  bool full, writing, must_write_header;
+  off64_t write_pos;  // byte where next entry written goes
+  off64_t read_pos;   // byte where next entry is read; 0 when not in read mode
+
+  int fd;
+
+  // to be journaled
+  list<pair<epoch_t,bufferlist> > writeq;
+  list<Context*> commitq;
+
+  // being journaled
+  list<Context*> writingq;
+
+  // write thread
+  Mutex write_lock;
+  Cond write_cond;
+  bool write_stop;
+
+  int _open(bool wr);
+  void print_header();
+  void read_header();
+  bufferptr prepare_header();
+  void start_writer();
+  void stop_writer();
+  void write_thread_entry();
+
+  void check_for_wrap(epoch_t epoch, off64_t pos, off64_t size);
+  bool prepare_single_dio_write(bufferlist& bl);
+  void prepare_multi_write(bufferlist& bl);
+  void do_write(bufferlist& bl);
+
+  // thread whose body is FileJournal::write_thread_entry()
+  class Writer : public Thread {
+    FileJournal *journal;
+  public:
+    Writer(FileJournal *fj) : journal(fj) {}
+    void *entry() {
+      journal->write_thread_entry();
+      return 0;
+    }
+  } write_thread;
+
+  // first usable offset: one block with directio, else sizeof(header)
+  off64_t get_top() {
+    if (directio)
+      return block_size;
+    else
+      return sizeof(header);
+  }
+
+ public:
+  FileJournal(__u64 fsid, Finisher *fin, const char *f, bool dio=false) :
+    Journal(fsid, fin), fn(f),
+    max_size(0), block_size(0),
+    directio(dio),
+    full(false), writing(false), must_write_header(false),
+    write_pos(0), read_pos(0),
+    fd(-1),
+    write_stop(false), write_thread(this) { }
+  ~FileJournal() {}
+
+  int create();
+  int open(epoch_t epoch);
+  void close();
+
+  bool is_writeable() {
+    return read_pos == 0;
+  }
+  void make_writeable();
+
+  // writes
+  void submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit);  // submit an item
+  void commit_epoch_start(epoch_t);   // mark epoch boundary
+  void commit_epoch_finish(epoch_t);  // mark prior epoch as committed (we can expire)
+
+  bool read_entry(bufferlist& bl, epoch_t& e);
+
+  bool is_full();
+
+  // reads
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+
+#include "FileStore.h"
+#include "include/types.h"
+
+#include "FileJournal.h"
+
+#include "common/Timer.h"
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <sys/file.h>
+#include <iostream>
+#include <cassert>
+#include <errno.h>
+#include <dirent.h>
+#include <sys/ioctl.h>
+#ifndef __CYGWIN__
+# include <sys/xattr.h>
+#endif
+
+#ifdef DARWIN
+#include <sys/param.h>
+#include <sys/mount.h>
+#endif // DARWIN
+
+
+#ifndef __CYGWIN__
+#ifndef DARWIN
+# include <linux/ioctl.h>
+# define BTRFS_IOCTL_MAGIC 0x94
+# define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
+# define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
+# define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
+# define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
+#endif
+#endif
+
+
+#include "config.h"
+
+#define dout(l) if (l<=g_conf.debug_filestore) *_dout << dbeginl << g_clock.now() << " filestore(" << basedir << ") "
+#define derr(l) if (l<=g_conf.debug_filestore) *_derr << dbeginl << g_clock.now() << " filestore(" << basedir << ") "
+
+#include "include/buffer.h"
+
+#include <map>
+
+
+/*
+ * xattr portability stupidity
+ */
+
+#ifdef DARWIN
+// Darwin variants: the xattr calls take extra trailing arguments, all passed as 0.
+int do_getxattr(const char *fn, const char *name, void *val, size_t size) {
+ return ::getxattr(fn, name, val, size, 0, 0);
+}
+int do_setxattr(const char *fn, const char *name, const void *val, size_t size) {
+ return ::setxattr(fn, name, val, size, 0, 0);
+}
+int do_removexattr(const char *fn, const char *name) {
+ return ::removexattr(fn, name, 0);
+}
+int do_listxattr(const char *fn, char *names, size_t len) {
+ return ::listxattr(fn, names, len, 0);
+}
+#else
+// Non-Darwin variants: same wrappers; only setxattr takes an extra argument (0).
+int do_getxattr(const char *fn, const char *name, void *val, size_t size) {
+ return ::getxattr(fn, name, val, size);
+}
+int do_setxattr(const char *fn, const char *name, const void *val, size_t size) {
+ return ::setxattr(fn, name, val, size, 0);
+}
+int do_removexattr(const char *fn, const char *name) {
+ return ::removexattr(fn, name);
+}
+// Returns whatever ::listxattr returns for the buffer (names, len).
+int do_listxattr(const char *fn, char *names, size_t len) {
+ return ::listxattr(fn, names, len);
+}
+
+#endif
+
+
+
+
+/* Run ::statfs on basedir; returns 0 on success, -errno on failure. */
+int FileStore::statfs(struct statfs *buf)
+{
+  int rc = ::statfs(basedir.c_str(), buf);
+  return (rc < 0) ? -errno : 0;
+}
+
+
+/*
+ * sorry, these are sensitive to the pobject_t and coll_t typing.
+ */
+/*
+ * Format an object's path into s:
+ *   <basedir>/objects/<volume>.<rank>.<oid word 0>.<oid word 1>   (all hex)
+ * The destination size is not known here; the caller must supply a large
+ * enough buffer.
+ */
+void FileStore::get_oname(pobject_t oid, char *s)
+{
+  assert(sizeof(oid) == 24);
+  // treat oid.oid as two consecutive 64-bit words
+  uint64_t *words = (uint64_t*)&oid.oid;
+#ifdef __LP64__
+  sprintf(s, "%s/objects/%04x.%04x.%016lx.%016lx", basedir.c_str(),
+          oid.volume, oid.rank, words[0], words[1]);
+#else
+  sprintf(s, "%s/objects/%04x.%04x.%016llx.%016llx", basedir.c_str(),
+          oid.volume, oid.rank, words[0], words[1]);
+#endif
+}
+
+/*
+ * Parse an object file name of the form VVVV.RRRR.<16 hex>.<16 hex>
+ * (as written by get_oname/get_coname) back into a pobject_t.
+ *
+ * Failure is reported through errno, which the directory-listing callers
+ * test: errno is 0 on success, EINVAL if the name does not have the
+ * expected separators, or whatever the strto* calls set (e.g. ERANGE).
+ */
+pobject_t FileStore::parse_object(char *s)
+{
+  pobject_t o;
+  assert(sizeof(o) == 24);
+
+  errno = 0;  // callers test errno afterwards; never report a stale value
+
+  // separators sit at fixed offsets; reject anything else instead of asserting
+  if (strlen(s) < 28 || s[4] != '.' || s[9] != '.' || s[26] != '.') {
+    errno = EINVAL;
+    return o;
+  }
+
+  //cout << " got object " << de->d_name << std::endl;
+  o.volume = strtoll(s, 0, 16);
+  o.rank = strtoll(s+5, 0, 16);
+  // unsigned parse: a signed one saturates for words with the top bit set
+  *(((uint64_t*)&o.oid) + 0) = strtoull(s+10, 0, 16);
+  *(((uint64_t*)&o.oid) + 1) = strtoull(s+27, 0, 16);
+
+  int err = errno;  // logging below must not disturb the result
+  dout(0) << " got " << o << " errno " << err << " on " << s << dendl;
+  errno = err;
+  return o;
+}
+
+/*
+ * Parse a collection directory name (hex, as written by get_cdir).
+ * Parsed as unsigned so ids with the top bit set are not saturated.
+ * Names that do not start with hex digits yield 0.
+ */
+coll_t FileStore::parse_coll(char *s)
+{
+  return strtoull(s, 0, 16);
+}
+
+/*
+ * Format a collection's directory path into s:
+ *   <basedir>/collections/<cid as 16 hex digits>
+ * The caller supplies the buffer; its size is not checked here.
+ */
+void FileStore::get_cdir(coll_t cid, char *s)
+{
+  assert(sizeof(cid) == 8);
+  const char *base = basedir.c_str();
+#ifdef __LP64__
+  sprintf(s, "%s/collections/%016lx", base, cid);
+#else
+  sprintf(s, "%s/collections/%016llx", base, cid);
+#endif
+}
+
+/*
+ * Format the path of an object's link inside a collection into s:
+ *   <basedir>/collections/<cid>/<volume>.<rank>.<oid word 0>.<oid word 1>
+ * The object part uses the same fields as get_oname() on every platform,
+ * so the names can be read back by parse_object().
+ * The caller supplies the buffer; its size is not checked here.
+ */
+void FileStore::get_coname(coll_t cid, pobject_t oid, char *s)
+{
+  assert(sizeof(oid) == 24);
+#ifdef __LP64__
+  sprintf(s, "%s/collections/%016lx/%04x.%04x.%016lx.%016lx", basedir.c_str(), cid,
+          oid.volume, oid.rank,
+          *((uint64_t*)&oid.oid),
+          *(((uint64_t*)&oid.oid) + 1));
+#else
+  // was &oid here, which formats the start of the whole struct, not oid.oid
+  sprintf(s, "%s/collections/%016llx/%04x.%04x.%016llx.%016llx", basedir.c_str(), cid,
+          oid.volume, oid.rank,
+          *((uint64_t*)&oid.oid),
+          *(((uint64_t*)&oid.oid) + 1));
+#endif
+}
+
+
+
+
+/*
+ * Create a fresh store under basedir.
+ *
+ * Optionally runs "mount <filestore_dev>", wipes basedir and recreates
+ * the collections/ and objects/ subdirectories via the shell, writes a
+ * random fsid to <basedir>/fsid, and, if <basedir>.journal already
+ * exists, creates a journal on it.
+ *
+ * Returns 0 on success, or a negative errno if the fsid file cannot be
+ * written.  Paths and commands are built with std::string so a long
+ * basedir cannot overflow a fixed buffer.
+ */
+int FileStore::mkfs()
+{
+  if (g_conf.filestore_dev) {
+    dout(0) << "mounting" << dendl;
+    string mount_cmd = string("mount ") + g_conf.filestore_dev;
+    system(mount_cmd.c_str());
+  }
+
+  dout(1) << "mkfs in " << basedir << dendl;
+
+  // wipe
+  string wipe = "test -d " + basedir + " && rm -r " + basedir +
+    " ; mkdir -p " + basedir + "/collections && mkdir -p " + basedir + "/objects";
+  dout(5) << "wipe: " << wipe << dendl;
+  system(wipe.c_str());
+
+  // fsid
+  fsid = rand();
+  string fn = basedir + "/fsid";
+  int fd = ::open(fn.c_str(), O_CREAT|O_TRUNC|O_WRONLY, 0644);
+  if (fd < 0) {
+    int err = errno;
+    derr(0) << "mkfs couldn't create " << fn << ": " << strerror(err) << dendl;
+    return -err;
+  }
+  ssize_t wrote = ::write(fd, &fsid, sizeof(fsid));
+  int werr = (wrote < 0) ? errno : EIO;  // a short write does not set errno
+  ::close(fd);
+  if (wrote != (ssize_t)sizeof(fsid)) {
+    derr(0) << "mkfs couldn't write " << fn << ": " << strerror(werr) << dendl;
+    return -werr;
+  }
+  dout(10) << "mkfs fsid is " << fsid << dendl;
+
+  // journal?
+  struct stat st;
+  fn = basedir + ".journal";
+  if (::lstat(fn.c_str(), &st) == 0) {
+    journal = new FileJournal(fsid, &finisher, fn.c_str(), g_conf.journal_dio);
+    if (journal->create() < 0) {
+      dout(0) << "mkfs error creating journal on " << fn << dendl;
+    } else {
+      dout(0) << "mkfs created journal on " << fn << dendl;
+    }
+    delete journal;
+    journal = 0;
+  } else {
+    dout(10) << "mkfs no journal at " << fn << dendl;
+  }
+
+  if (g_conf.filestore_dev) {
+    // only logged: the actual "umount <dev>" call is disabled
+    dout(0) << "umounting" << dendl;
+  }
+
+  dout(1) << "mkfs done in " << basedir << dendl;
+
+  return 0;
+}
+
+/*
+ * Bring the store online.
+ *
+ * Checks that basedir exists, decides whether attrs/collections are
+ * faked in memory, probes for btrfs (when BTRFS_IOC_SYNC is defined),
+ * loads fsid and the last committed epoch, opens and replays the journal
+ * if <basedir>.journal exists, then starts journaling and the sync thread.
+ *
+ * Returns 0 on success; a negative errno if basedir or the fsid file is
+ * unusable; or -EINVAL if journal_replay() reports it.  A missing
+ * commit_epoch file is tolerated because mkfs() does not create one.
+ */
+int FileStore::mount()
+{
+  if (g_conf.filestore_dev) {
+    // only logged: the actual "mount <dev>" call is disabled
+    dout(0) << "mounting" << dendl;
+  }
+
+  dout(5) << "basedir " << basedir << dendl;
+
+  // make sure global base dir exists
+  struct stat st;
+  int r = ::stat(basedir.c_str(), &st);
+  if (r != 0) {
+    int err = errno;  // grab before logging can change it
+    derr(0) << "unable to stat basedir " << basedir << ", " << strerror(err) << dendl;
+    return -err;
+  }
+
+  if (g_conf.filestore_fake_collections) {
+    dout(0) << "faking collections (in memory)" << dendl;
+    fake_collections = true;
+  }
+
+  // fake attrs?
+  // let's test to see if they work.
+  if (g_conf.filestore_fake_attrs) {
+    dout(0) << "faking attrs (in memory)" << dendl;
+    fake_attrs = true;
+  } else {
+    char names[1000];
+    r = do_listxattr(basedir.c_str(), names, 1000);
+    if (r < 0) {
+      int err = errno;
+      derr(0) << "xattrs don't appear to work (" << strerror(err) << "), specify --filestore_fake_attrs to fake them (in memory)." << dendl;
+      assert(0);
+    }
+  }
+
+#ifdef BTRFS_IOC_SYNC
+  // is this btrfs?
+  btrfs_fd = ::open(basedir.c_str(), O_DIRECTORY);
+  r = (btrfs_fd >= 0) ? ::ioctl(btrfs_fd, BTRFS_IOC_SYNC) : -1;
+  if (r == 0) {
+    dout(0) << "mount detected btrfs" << dendl;
+  } else {
+    int err = errno;  // r is just -1; the reason is in errno
+    dout(0) << "mount did NOT detect btrfs: " << strerror(err) << dendl;
+    if (btrfs_fd >= 0)
+      ::close(btrfs_fd);
+    btrfs_fd = -1;
+  }
+#endif
+
+  // get fsid
+  string fn = basedir + "/fsid";
+  int fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd < 0) {
+    int err = errno;
+    derr(0) << "mount couldn't open " << fn << ": " << strerror(err) << dendl;
+    return -err;
+  }
+  ssize_t got = ::read(fd, &fsid, sizeof(fsid));
+  int rerr = (got < 0) ? errno : EINVAL;  // short file: treat as invalid
+  ::close(fd);
+  if (got != (ssize_t)sizeof(fsid)) {
+    derr(0) << "mount couldn't read " << fn << ": " << strerror(rerr) << dendl;
+    return -rerr;
+  }
+  dout(10) << "mount fsid is " << fsid << dendl;
+
+  // get epoch
+  fn = basedir + "/commit_epoch";
+  fd = ::open(fn.c_str(), O_RDONLY);
+  if (fd >= 0) {
+    if (::read(fd, &super_epoch, sizeof(super_epoch)) != (ssize_t)sizeof(super_epoch)) {
+      derr(0) << "mount couldn't read " << fn << dendl;
+    }
+    ::close(fd);
+  } else {
+    dout(5) << "mount no commit_epoch at " << fn << dendl;
+  }
+  dout(5) << "mount epoch is " << super_epoch << dendl;
+
+  // journal
+  fn = basedir + ".journal";
+  if (::stat(fn.c_str(), &st) == 0) {
+    dout(10) << "mount opening journal at " << fn << dendl;
+    journal = new FileJournal(fsid, &finisher, fn.c_str(), g_conf.journal_dio);
+  } else {
+    dout(10) << "mount no journal at " << fn << dendl;
+  }
+  r = journal_replay();
+  if (r == -EINVAL) {
+    dout(0) << "mount got EINVAL on journal open, not mounting" << dendl;
+    return r;
+  }
+  journal_start();
+  sync_thread.create();
+
+  // all okay.
+  return 0;
+}
+
+/*
+ * Take the store offline: trigger a sync, stop journaling, stop and join
+ * the sync thread, and close the btrfs descriptor if one is open.
+ * Always returns 0.
+ */
+int FileStore::umount()
+{
+  dout(5) << "umount " << basedir << dendl;
+
+  sync();
+  journal_stop();
+
+  // ask sync_entry() to exit, then wait for it
+  lock.Lock();
+  stop = true;
+  sync_cond.Signal();
+  lock.Unlock();
+  sync_thread.join();
+
+  if (btrfs_fd >= 0) {
+    ::close(btrfs_fd);
+    btrfs_fd = -1;
+  }
+
+  if (g_conf.filestore_dev) {
+    // only logged: the actual "umount <dev>" call is disabled, so no
+    // command string is built (the old fixed buffer could overflow)
+    dout(0) << "umounting" << dendl;
+  }
+
+  // nothing
+  return 0;
+}
+
+
+/*
+ * Begin a btrfs transaction.
+ *
+ * Returns 0 immediately when btrfs was not detected (btrfs_fd < 0).
+ * Otherwise opens basedir, issues BTRFS_IOC_TRANS_START on it and returns
+ * the open descriptor, to be handed to transaction_end().  Returns a
+ * negative errno if the open or the ioctl fails.
+ */
+int FileStore::transaction_start()
+{
+  if (btrfs_fd < 0)
+    return 0;
+
+  int fd = ::open(basedir.c_str(), O_RDONLY);
+  if (fd < 0) {
+    int err = errno;
+    derr(0) << "transaction_start got " << strerror(err)
+            << " from btrfs open" << dendl;
+    return -err;  // do not go on to ioctl an invalid descriptor
+  }
+  if (::ioctl(fd, BTRFS_IOC_TRANS_START) < 0) {
+    int err = errno;  // grab before logging/close can change it
+    derr(0) << "transaction_start got " << strerror(err)
+            << " from btrfs ioctl" << dendl;
+    ::close(fd);
+    return -err;
+  }
+  dout(10) << "transaction_start " << fd << dendl;
+  return fd;
+}
+
+/*
+ * Finish a transaction started by transaction_start() by closing its
+ * descriptor.  Does nothing when btrfs was not detected.
+ */
+void FileStore::transaction_end(int fd)
+{
+  if (btrfs_fd >= 0) {
+    dout(10) << "transaction_end " << fd << dendl;
+    ::close(fd);
+  }
+}
+
+
+// --------------------
+// objects
+
+/* True if stat() on the object succeeds. */
+bool FileStore::exists(pobject_t oid)
+{
+  struct stat sb;
+  return stat(oid, &sb) == 0;
+}
+
+/*
+ * stat() the object's backing file into *st.
+ * Returns 0 on success or a negative errno.
+ */
+int FileStore::stat(pobject_t oid, struct stat *st)
+{
+  dout(20) << "stat " << oid << dendl;
+  char fn[200];
+  get_oname(oid,fn);
+  int r = ::stat(fn, st);
+  int err = errno;  // grab before the debug output below can change it
+  dout(20) << "stat " << oid << " at " << fn << " = " << r << dendl;
+  return r < 0 ? -err:r;
+}
+
+
+/*
+ * Unlink the object's backing file.  On success the removal is journaled
+ * with onsafe; on failure onsafe is deleted and -errno is returned.
+ */
+int FileStore::remove(pobject_t oid, Context *onsafe)
+{
+  dout(20) << "remove " << oid << dendl;
+  char path[200];
+  get_oname(oid, path);
+
+  int rc = ::unlink(path);
+  if (rc != 0) {
+    delete onsafe;
+    return rc < 0 ? -errno : rc;
+  }
+  journal_remove(oid, onsafe);
+  return rc;
+}
+
+/*
+ * Truncate the object's backing file to size.  On success the operation
+ * is journaled with onsafe; on failure onsafe is deleted (as the other
+ * mutating operations do) and -errno is returned.
+ */
+int FileStore::truncate(pobject_t oid, off_t size, Context *onsafe)
+{
+  dout(20) << "truncate " << oid << " size " << size << dendl;
+
+  char fn[200];
+  get_oname(oid,fn);
+  int r = ::truncate(fn, size);
+  int err = errno;  // grab before anything else can change it
+  if (r >= 0)
+    journal_truncate(oid, size, onsafe);
+  else
+    delete onsafe;  // previously leaked on failure
+  return r < 0 ? -err:r;
+}
+
+/*
+ * Read up to len bytes at offset from the object's file and append them
+ * to bl.  len == 0 means "use the file size as the length".
+ *
+ * Returns the number of bytes read (0 at EOF) or a negative errno if the
+ * open, seek or read fails.  The file is flock()ed for the duration.
+ */
+int FileStore::read(pobject_t oid,
+                    off_t offset, size_t len,
+                    bufferlist& bl) {
+  dout(20) << "read " << oid << " len " << len << " off " << offset << dendl;
+
+  char fn[200];
+  get_oname(oid,fn);
+
+  int fd = ::open(fn, O_RDONLY);
+  if (fd < 0) {
+    int err = errno;
+    dout(10) << "read couldn't open " << fn << " errno " << err << " " << strerror(err) << dendl;
+    return -err;
+  }
+  ::flock(fd, LOCK_EX);  // lock for safety
+
+  int ret = 0;
+  off_t actual = ::lseek(fd, offset, SEEK_SET);
+  if (actual < 0)
+    ret = -errno;
+
+  if (len == 0) {
+    struct stat st;
+    ::fstat(fd, &st);
+    len = st.st_size;
+  }
+
+  if (actual == offset) {
+    bufferptr bptr(len);  // prealloc space for entire read
+    // signed result: -1 must not be mistaken for a huge byte count
+    ssize_t got = ::read(fd, bptr.c_str(), len);
+    if (got < 0) {
+      ret = -errno;
+    } else {
+      bptr.set_length(got);  // properly size the buffer
+      if (got > 0) bl.push_back( bptr );  // put it in the target bufferlist
+      ret = got;
+    }
+  }
+  ::flock(fd, LOCK_UN);
+  ::close(fd);
+  return ret;
+}
+
+
+/*
+ * Write the contents of bl to the object's file at offset, creating the
+ * file if needed.
+ *
+ * Returns the number of bytes written, or a negative errno if the file
+ * cannot be opened or a write fails.  On success the write is journaled
+ * with onsafe; if a write fails onsafe is deleted.  Short writes are
+ * continued until each buffer is fully written.
+ */
+int FileStore::write(pobject_t oid,
+                     off_t offset, size_t len,
+                     const bufferlist& bl,
+                     Context *onsafe)
+{
+  char fn[200];
+  get_oname(oid,fn);
+
+  dout(20) << "write " << fn << " len " << len << " off " << offset << dendl;
+
+  int flags = O_WRONLY|O_CREAT;
+  int fd = ::open(fn, flags, 0644);
+  if (fd < 0) {
+    int err = errno;
+    derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << err << " " << strerror(err) << dendl;
+    return -err;
+  }
+  ::flock(fd, LOCK_EX);  // lock for safety
+
+  // seek
+  off_t actual = ::lseek(fd, offset, SEEK_SET);
+  int did = 0;
+  assert(actual == offset);
+
+  // write buffers
+  for (list<bufferptr>::const_iterator it = bl.buffers().begin();
+       it != bl.buffers().end() && did >= 0;
+       it++) {
+    const char *p = (const char*)(*it).c_str();
+    size_t left = (*it).length();
+    while (left > 0) {
+      ssize_t r = ::write(fd, p, left);
+      if (r < 0 && errno == EINTR)
+        continue;
+      if (r <= 0) {
+        // record the failure so the error path below is actually taken
+        int err = (r < 0) ? errno : EIO;
+        derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << err << " " << strerror(err) << dendl;
+        did = -err;
+        break;
+      }
+      p += r;
+      left -= r;
+      did += r;
+    }
+  }
+
+  ::flock(fd, LOCK_UN);
+
+  // schedule sync
+  if (did >= 0)
+    journal_write(oid, offset, len, bl, onsafe);
+  else
+    delete onsafe;
+
+  ::close(fd);
+
+  return did;
+}
+
+/*
+ * Copy object oldoid to newoid (created/truncated).
+ *
+ * With btrfs detected the copy is done with BTRFS_IOC_CLONE; otherwise
+ * the data is copied with a plain read/write loop.  (The previous
+ * splice() path passed two regular files, which splice() rejects, and
+ * the old fallback loop wrote more than it had read after a short read.)
+ *
+ * Returns 0 on success or a negative errno.  Both descriptors are closed
+ * on every path.
+ */
+int FileStore::clone(pobject_t oldoid, pobject_t newoid)
+{
+  char ofn[200], nfn[200];
+  get_oname(oldoid, ofn);
+  get_oname(newoid, nfn);
+
+  dout(20) << "clone " << ofn << " -> " << nfn << dendl;
+
+  int o = ::open(ofn, O_RDONLY);
+  if (o < 0)
+    return -errno;
+  int n = ::open(nfn, O_CREAT|O_TRUNC|O_WRONLY, 0644);
+  if (n < 0) {
+    int operr = errno;
+    ::close(o);
+    return -operr;
+  }
+
+  int err = 0;
+  if (btrfs_fd >= 0) {
+    if (::ioctl(n, BTRFS_IOC_CLONE, o) < 0)
+      err = errno;
+  } else {
+    char buf[4096*10];
+    while (err == 0) {
+      ssize_t got = ::read(o, buf, sizeof(buf));
+      if (got == 0)
+        break;  // EOF
+      if (got < 0) {
+        if (errno == EINTR) continue;
+        err = errno;
+        break;
+      }
+      // write exactly what was read, continuing after short writes
+      ssize_t done = 0;
+      while (done < got) {
+        ssize_t w = ::write(n, buf+done, got-done);
+        if (w < 0) {
+          if (errno == EINTR) continue;
+          err = errno;
+          break;
+        }
+        done += w;
+      }
+    }
+  }
+
+  ::close(n);
+  ::close(o);
+  return err ? -err : 0;
+}
+
+
+/*
+ * Body of the sync thread.
+ *
+ * Until stop is set: wait on sync_cond for up to filestore_sync_interval,
+ * then, with the lock dropped, call commit_start(), write super_epoch to
+ * <basedir>/commit_epoch and fsync it, and call commit_finish().
+ * Failures to open or write the epoch file are logged; commit_finish()
+ * is still called, as before.
+ */
+void FileStore::sync_entry()
+{
+  lock.Lock();
+  utime_t interval;
+  interval.set_from_double(g_conf.filestore_sync_interval);
+  // built once; std::string avoids overflowing a fixed-size path buffer
+  const string fn = basedir + "/commit_epoch";
+  while (!stop) {
+    dout(20) << "sync_entry waiting for " << interval << dendl;
+    sync_cond.WaitInterval(lock, interval);
+    lock.Unlock();
+
+    dout(20) << "sync_entry committing " << super_epoch << " " << interval << dendl;
+    commit_start();
+
+    // induce an fs sync.
+    // we assume data=ordered or similar semantics
+    int fd = ::open(fn.c_str(), O_CREAT|O_WRONLY, 0644);
+    if (fd < 0) {
+      int err = errno;
+      derr(0) << "sync_entry couldn't open " << fn << ": " << strerror(err) << dendl;
+    } else {
+      if (::write(fd, &super_epoch, sizeof(super_epoch)) != (ssize_t)sizeof(super_epoch)) {
+        derr(0) << "sync_entry couldn't write " << fn << dendl;
+      }
+      ::fsync(fd);  // this should cause the fs's journal to commit.  (on btrfs too.)
+      ::close(fd);
+    }
+
+    commit_finish();
+
+    lock.Lock();
+    dout(20) << "sync_entry committed " << super_epoch << dendl;
+  }
+  lock.Unlock();
+}
+
+/* Signal sync_cond (on which sync_entry() waits) while holding lock. */
+void FileStore::sync()
+{
+  lock.Lock();
+  sync_cond.Signal();
+  lock.Unlock();
+}
+
+// Hand onsafe to journal_sync(), then signal the sync thread via sync().
+void FileStore::sync(Context *onsafe)
+{
+ journal_sync(onsafe);
+ sync();
+}
+
+
+// -------------------------------
+// attributes
+
+// objects
+
+/*
+ * Set one attribute on an object, either in the in-memory fake attrs or
+ * as an xattr on the object's file.  On success the change is journaled
+ * with onsafe; on failure onsafe is deleted.
+ * Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::setattr(pobject_t oid, const char *name,
+                       const void *value, size_t size,
+                       Context *onsafe)
+{
+  int r;
+  if (fake_attrs)
+    r = attrs.setattr(oid, name, value, size, onsafe);
+  else {
+    char fn[200];  // same size the other get_oname() callers use
+    get_oname(oid, fn);
+    r = do_setxattr(fn, name, value, size);
+  }
+  int err = errno;  // grab before journal/delete can change it
+  if (r >= 0)
+    journal_setattr(oid, name, value, size, onsafe);
+  else
+    delete onsafe;
+  return r < 0 ? -err:r;
+}
+
+/*
+ * Set every attribute in aset on an object, stopping at the first xattr
+ * failure.  On success the change is journaled.
+ * Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::setattrs(pobject_t oid, map<string,bufferptr>& aset)
+{
+  int r = 0;
+  int err = 0;
+  if (fake_attrs) {
+    r = attrs.setattrs(oid, aset);
+    err = errno;
+  } else {
+    char fn[200];  // same size the other get_oname() callers use
+    get_oname(oid, fn);
+    for (map<string,bufferptr>::iterator p = aset.begin();
+         p != aset.end();
+         ++p) {
+      r = do_setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length());
+      if (r < 0) {
+        err = errno;  // grab before the stream output can change it
+        cerr << "error setxattr " << strerror(err) << std::endl;
+        break;
+      }
+    }
+  }
+  if (r >= 0)
+    journal_setattrs(oid, aset, 0);
+  return r < 0 ? -err:r;
+}
+
+/*
+ * Fetch one attribute of an object into value (at most size bytes), from
+ * the fake attrs or from the file's xattrs.
+ * Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::getattr(pobject_t oid, const char *name,
+                       void *value, size_t size)
+{
+  int r;
+  if (fake_attrs)
+    r = attrs.getattr(oid, name, value, size);
+  else {
+    char fn[200];  // same size the other get_oname() callers use
+    get_oname(oid, fn);
+    r = do_getxattr(fn, name, value, size);
+  }
+  return r < 0 ? -errno:r;
+}
+
+/*
+ * Collect all attributes of an object into aset.
+ *
+ * With fake attrs the request is forwarded.  Otherwise the file's xattr
+ * names are listed and each value fetched.  listxattr reports the byte
+ * length of a NUL-separated name list, so the list is walked by pointer
+ * rather than by a name count.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int FileStore::getattrs(pobject_t oid, map<string,bufferptr>& aset)
+{
+  if (fake_attrs) {
+    int r = attrs.getattrs(oid, aset);
+    return r < 0 ? -errno:r;
+  }
+
+  char fn[200];  // same size the other get_oname() callers use
+  get_oname(oid, fn);
+
+  char val[1000];
+  char names[1000];
+  int len = do_listxattr(fn, names, sizeof(names));
+  if (len < 0)
+    return -errno;
+
+  char *name = names;
+  char *end = names + len;
+  while (name < end) {
+    int l = do_getxattr(fn, name, val, sizeof(val));
+    int err = errno;  // grab before the debug output can change it
+    dout(0) << "getattrs " << oid << " getting '" << name << "' = " << l << " bytes" << dendl;
+    if (l < 0)
+      return -err;
+    aset[name].append(val, l);
+    name += strlen(name) + 1;  // step over this name and its NUL
+  }
+  return 0;
+}
+
+/*
+ * Remove one attribute from an object (fake attrs or file xattr).
+ * On success the change is journaled with onsafe; on failure onsafe is
+ * deleted.  Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::rmattr(pobject_t oid, const char *name, Context *onsafe)
+{
+  int r;
+  if (fake_attrs)
+    r = attrs.rmattr(oid, name, onsafe);
+  else {
+    char fn[200];  // same size the other get_oname() callers use
+    get_oname(oid, fn);
+    r = do_removexattr(fn, name);
+  }
+  int err = errno;  // grab before journal/delete can change it
+  if (r >= 0)
+    journal_rmattr(oid, name, onsafe);
+  else
+    delete onsafe;
+  return r < 0 ? -err:r;
+}
+
+
+
+// collections
+
+/*
+ * Set one attribute on a collection (fake attrs, or an xattr on the
+ * collection directory).  Journals with onsafe on success; deletes
+ * onsafe and returns -errno on failure.
+ */
+int FileStore::collection_setattr(coll_t c, const char *name,
+                                  void *value, size_t size,
+                                  Context *onsafe)
+{
+  int rc;
+  if (!fake_attrs) {
+    char path[200];
+    get_cdir(c, path);
+    rc = do_setxattr(path, name, value, size);
+  } else {
+    rc = attrs.collection_setattr(c, name, value, size, onsafe);
+  }
+
+  if (rc < 0) {
+    delete onsafe;
+    return -errno;
+  }
+  journal_collection_setattr(c, name, value, size, onsafe);
+  return rc;
+}
+
+/*
+ * Remove one attribute from a collection (fake attrs, or the xattr on
+ * the collection directory).  Returns the underlying result, or -errno
+ * if it was negative.
+ * NOTE(review): on the xattr path onsafe is neither journaled nor
+ * deleted here -- confirm whether that is intended.
+ */
+int FileStore::collection_rmattr(coll_t c, const char *name,
+                                 Context *onsafe)
+{
+  int rc;
+  if (!fake_attrs) {
+    char path[200];
+    get_cdir(c, path);
+    rc = do_removexattr(path, name);
+  } else {
+    rc = attrs.collection_rmattr(c, name, onsafe);
+  }
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+/*
+ * Fetch one attribute of a collection into value (at most size bytes).
+ * Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::collection_getattr(coll_t c, const char *name,
+                                  void *value, size_t size)
+{
+  int rc;
+  if (!fake_attrs) {
+    char path[200];
+    get_cdir(c, path);
+    rc = do_getxattr(path, name, value, size);
+  } else {
+    rc = attrs.collection_getattr(c, name, value, size);
+  }
+  if (rc < 0)
+    return -errno;
+  return rc;
+}
+
+/*
+ * Set every attribute in aset on a collection, stopping at the first
+ * xattr failure.  On success the change is journaled.
+ * Returns the underlying result, or -errno if it was negative.
+ */
+int FileStore::collection_setattrs(coll_t cid, map<string,bufferptr>& aset)
+{
+  int r = 0;
+  int err = 0;
+  if (fake_attrs) {
+    r = attrs.collection_setattrs(cid, aset);
+    err = errno;
+  } else {
+    char fn[200];
+    get_cdir(cid, fn);
+    // note: no inner "int r" here -- the old one shadowed the result
+    // tested below and left it uninitialised
+    for (map<string,bufferptr>::iterator p = aset.begin();
+         p != aset.end();
+         ++p) {
+      r = do_setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length());
+      if (r < 0) {
+        err = errno;
+        break;
+      }
+    }
+  }
+  if (r >= 0)
+    journal_collection_setattrs(cid, aset, 0);
+  return r < 0 ? -err:r;
+}
+
+/*
+ * Collect all attributes of a collection into aset.
+ *
+ * With fake attrs the request is forwarded.  Otherwise the xattr names
+ * of the collection directory are listed and each value fetched.
+ * listxattr reports the byte length of a NUL-separated name list, so the
+ * list is walked by pointer rather than by a name count.
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int FileStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
+{
+  if (fake_attrs) {
+    int r = attrs.collection_getattrs(cid, aset);
+    return r < 0 ? -errno:r;
+  }
+
+  char fn[200];
+  get_cdir(cid, fn);
+
+  char val[1000];
+  char names[1000];
+  int len = do_listxattr(fn, names, sizeof(names));
+  if (len < 0)
+    return -errno;
+
+  char *name = names;
+  char *end = names + len;
+  while (name < end) {
+    int l = do_getxattr(fn, name, val, sizeof(val));
+    int err = errno;  // grab before the debug output can change it
+    dout(0) << "getattrs " << cid << " getting '" << name << "' = " << l << " bytes" << dendl;
+    if (l < 0)
+      return -err;
+    aset[name].append(val, l);
+    name += strlen(name) + 1;  // step over this name and its NUL
+  }
+  return 0;
+}
+
+
+/*
+int FileStore::collection_listattr(coll_t c, char *attrs, size_t size)
+{
+ if (fake_attrs) return collection_listattr(c, attrs, size);
+ return 0;
+}
+*/
+
+
+/*
+ * Append every object found in <basedir>/objects to ls.
+ * Entries starting with '.' are skipped, as are names for which
+ * parse_object() leaves errno set.  Returns 0.
+ */
+int FileStore::list_objects(list<pobject_t>& ls)
+{
+  char fn[200];
+  sprintf(fn, "%s/objects", basedir.c_str());
+
+  DIR *dir = ::opendir(fn);
+  assert(dir);
+
+  struct dirent *de;
+  while ((de = ::readdir(dir)) != 0) {
+    if (de->d_name[0] == '.') continue;
+    // parse
+    errno = 0;  // the check below must not see a stale value
+    pobject_t o = parse_object(de->d_name);
+    if (errno) continue;
+    ls.push_back(o);
+  }
+
+  ::closedir(dir);
+  return 0;
+}
+
+
+// --------------------------
+// collections
+
+/*
+ * Append every collection id found under <basedir>/collections to ls
+ * (or forward to the fake collections).  Directory entries that parse
+ * to 0 are skipped.  Returns 0 on the directory path.
+ */
+int FileStore::list_collections(list<coll_t>& ls)
+{
+  if (fake_collections) return collections.list_collections(ls);
+
+  char path[200];
+  sprintf(path, "%s/collections", basedir.c_str());
+
+  DIR *dir = ::opendir(path);
+  assert(dir);
+
+  for (struct dirent *ent = ::readdir(dir); ent != 0; ent = ::readdir(dir)) {
+    // parse
+    errno = 0;
+    coll_t c = parse_coll(ent->d_name);
+    if (!c) continue;
+    ls.push_back(c);
+  }
+
+  ::closedir(dir);
+  return 0;
+}
+
+/*
+ * Create a collection: mkdir its directory (mode 0755), or forward to
+ * the fake collections.  Journals with onsafe on success; deletes onsafe
+ * and returns -errno on failure.
+ */
+int FileStore::create_collection(coll_t c,
+                                 Context *onsafe)
+{
+  if (fake_collections) return collections.create_collection(c, onsafe);
+
+  char path[200];
+  get_cdir(c, path);
+
+  int rc = ::mkdir(path, 0755);
+  if (rc < 0) {
+    delete onsafe;
+    return -errno;
+  }
+  journal_create_collection(c, onsafe);
+  return rc;
+}
+
+/*
+ * Destroy a collection: remove its directory tree via the shell, or
+ * forward to the fake collections.  The shell command's result is not
+ * checked (fixme), so the removal is always journaled and 0 returned.
+ * The command is built with std::string: two copies of the path do not
+ * fit the old fixed 200-byte buffer in general.
+ */
+int FileStore::destroy_collection(coll_t c,
+                                  Context *onsafe)
+{
+  if (fake_collections) return collections.destroy_collection(c, onsafe);
+
+  char fn[200];
+  get_cdir(c, fn);
+  string cmd = string("test -d ") + fn + " && rm -r " + fn;
+  system(cmd.c_str());
+
+  // fixme: always treated as success
+  journal_destroy_collection(c, onsafe);
+  return 0;
+}
+
+/*
+ * lstat() a collection's directory into *st (or forward to the fake
+ * collections).  Returns 0 on success or -errno.
+ */
+int FileStore::collection_stat(coll_t c, struct stat *st)
+{
+  if (fake_collections) return collections.collection_stat(c, st);
+
+  char path[200];
+  get_cdir(c, path);
+  if (::lstat(path, st) < 0)
+    return -errno;
+  return 0;
+}
+
+/* True if the collection exists (collection_stat() succeeds). */
+bool FileStore::collection_exists(coll_t c)
+{
+  if (fake_collections) return collections.collection_exists(c);
+
+  struct stat sb;
+  int rc = collection_stat(c, &sb);
+  return rc == 0;
+}
+
+
+/*
+ * Add object o to collection c by hard-linking the object's file into
+ * the collection directory (or forward to the fake collections).
+ * Journals with onsafe on success; deletes onsafe and returns -errno on
+ * failure.
+ */
+int FileStore::collection_add(coll_t c, pobject_t o,
+                              Context *onsafe)
+{
+  int rc;
+  if (!fake_collections) {
+    char linkpath[200];
+    get_coname(c, o, linkpath);
+    char objpath[200];
+    get_oname(o, objpath);
+    rc = ::link(objpath, linkpath);
+  } else {
+    rc = collections.collection_add(c, o, onsafe);
+  }
+
+  if (rc < 0) {
+    delete onsafe;
+    return -errno;
+  }
+  journal_collection_add(c, o, onsafe);
+  return rc;
+}
+
+/*
+ * Remove object o from collection c by unlinking its entry in the
+ * collection directory (or forward to the fake collections).
+ * Journals with onsafe on success; deletes onsafe and returns -errno on
+ * failure.
+ */
+int FileStore::collection_remove(coll_t c, pobject_t o,
+                                 Context *onsafe)
+{
+  int rc;
+  if (!fake_collections) {
+    char linkpath[200];
+    get_coname(c, o, linkpath);
+    rc = ::unlink(linkpath);
+  } else {
+    rc = collections.collection_remove(c, o, onsafe);
+  }
+
+  if (rc < 0) {
+    delete onsafe;
+    return -errno;
+  }
+  journal_collection_remove(c, o, onsafe);
+  return rc;
+}
+
+/*
+ * Append every object linked in collection c to ls (or forward to the
+ * fake collections).  Entries starting with '.' are skipped, as are
+ * names for which parse_object() leaves errno set.  Returns 0 on the
+ * directory path.
+ */
+int FileStore::collection_list(coll_t c, list<pobject_t>& ls)
+{
+  if (fake_collections) return collections.collection_list(c, ls);
+
+  char fn[200];
+  get_cdir(c, fn);
+
+  DIR *dir = ::opendir(fn);
+  assert(dir);
+
+  struct dirent *de;
+  while ((de = ::readdir(dir)) != 0) {
+    // parse
+    if (de->d_name[0] == '.') continue;
+    //cout << " got object " << de->d_name << std::endl;
+    errno = 0;  // the check below must not see a stale value
+    pobject_t o = parse_object(de->d_name);
+    if (errno) continue;
+    ls.push_back(o);
+  }
+
+  ::closedir(dir);
+  return 0;
+}
+
+// eof.
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __FILESTORE_H
+#define __FILESTORE_H
+
+#include "ObjectStore.h"
+#include "JournalingObjectStore.h"
+#include "common/ThreadPool.h"
+#include "common/Mutex.h"
+
+#include "Fake.h"
+//#include "FakeStoreBDBCollections.h"
+
+
+#include <map>
+using namespace std;
+
+#include <ext/hash_map>
+using namespace __gnu_cxx;
+
+
+// fake attributes in memory, if we need to.
+
+/*
+ * FileStore -- object store that keeps each object as a file under
+ * <basedir>/objects and each collection as a directory of links under
+ * <basedir>/collections.  Attributes are stored as xattrs, or faked in
+ * memory (FakeAttrs); collections may likewise be faked
+ * (FakeCollections).  A background thread runs sync_entry().
+ */
+class FileStore : public JournalingObjectStore {
+  string basedir;
+  __u64 fsid;    // written by mkfs(), loaded by mount()
+
+  int btrfs_fd;  // >= 0 if btrfs
+
+  // fake attrs?
+  FakeAttrs attrs;
+  bool fake_attrs;
+
+  // fake collections?
+  FakeCollections collections;
+  bool fake_collections;
+
+  // helper fns
+  void get_oname(pobject_t oid, char *s);
+  void get_cdir(coll_t cid, char *s);
+  void get_coname(coll_t cid, pobject_t oid, char *s);
+  pobject_t parse_object(char *s);
+  coll_t parse_coll(char *s);
+
+  // sync thread
+  Mutex lock;
+  Cond sync_cond;
+  bool stop;
+  void sync_entry();
+  struct SyncThread : public Thread {
+    FileStore *fs;
+    SyncThread(FileStore *f) : fs(f) {}
+    void *entry() {
+      fs->sync_entry();
+      return 0;
+    }
+  } sync_thread;
+
+  void sync_fs();  // actually sync underlying fs
+
+ public:
+  FileStore(const char *base) :
+    basedir(base),
+    fsid(0),  // defined value until mkfs()/mount() set the real one
+    btrfs_fd(-1),
+    attrs(this), fake_attrs(false),
+    collections(this), fake_collections(false),
+    stop(false), sync_thread(this) { }
+
+  int mount();
+  int umount();
+  int mkfs();
+
+  int transaction_start();
+  void transaction_end(int id);
+
+  int statfs(struct statfs *buf);
+
+  // ------------------
+  // objects
+  int pick_object_revision_lt(pobject_t& oid) {
+    return 0;
+  }
+  bool exists(pobject_t oid);
+  int stat(pobject_t oid, struct stat *st);
+  int remove(pobject_t oid, Context *onsafe);
+  int truncate(pobject_t oid, off_t size, Context *onsafe);
+  int read(pobject_t oid, off_t offset, size_t len, bufferlist& bl);
+  int write(pobject_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe);
+  int clone(pobject_t oldoid, pobject_t newoid);
+
+  void sync();
+  void sync(Context *onsafe);
+
+  int list_objects(list<pobject_t>& ls);
+
+  // attrs
+  int setattr(pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe=0);
+  int setattrs(pobject_t oid, map<string,bufferptr>& aset);
+  int getattr(pobject_t oid, const char *name, void *value, size_t size);
+  int getattrs(pobject_t oid, map<string,bufferptr>& aset);
+  int rmattr(pobject_t oid, const char *name, Context *onsafe=0);
+  //int listattr(pobject_t oid, char *attrs, size_t size);
+  int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0);
+  int collection_rmattr(coll_t c, const char *name, Context *onsafe=0);
+  int collection_getattr(coll_t c, const char *name, void *value, size_t size);
+  //int collection_listattr(coll_t c, char *attrs, size_t size);
+  int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
+  int collection_setattrs(coll_t cid, map<string,bufferptr> &aset);
+
+  // collections
+  int list_collections(list<coll_t>& ls);
+  int create_collection(coll_t c, Context *onsafe=0);
+  int destroy_collection(coll_t c, Context *onsafe=0);
+  int collection_stat(coll_t c, struct stat *st);
+  bool collection_exists(coll_t c);
+  int collection_add(coll_t c, pobject_t o, Context *onsafe=0);
+  int collection_remove(coll_t c, pobject_t o, Context *onsafe=0);
+  int collection_list(coll_t c, list<pobject_t>& o);
+
+
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __EBOFS_JOURNAL_H
+#define __EBOFS_JOURNAL_H
+
+#include "include/buffer.h"
+#include "include/Context.h"
+#include "common/Finisher.h"
+
+// Abstract journal interface.  Holds the fsid and the Finisher passed to
+// the constructor; every operation is pure virtual and supplied by a
+// concrete journal (FileJournal derives from this class).
+class Journal {
+protected:
+ __u64 fsid;
+ Finisher *finisher;
+
+public:
+ Journal(__u64 f, Finisher *fin) : fsid(f), finisher(fin) { }
+ virtual ~Journal() { }
+
+ // lifecycle
+ virtual int create() = 0;
+ virtual int open(epoch_t epoch) = 0;
+ virtual void close() = 0;
+
+ // writes
+ virtual bool is_writeable() = 0;
+ virtual void make_writeable() = 0;
+ virtual void submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit) = 0;
+ virtual void commit_epoch_start(epoch_t) = 0; // mark epoch boundary
+ virtual void commit_epoch_finish(epoch_t) = 0; // mark prior epoch as committed (we can expire)
+ // read the next entry into bl and its epoch into e; false when there is none
+ virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0;
+ virtual bool is_full() = 0;
+
+ // reads/recovery
+
+};
+
+#endif
--- /dev/null
+
+#include "JournalingObjectStore.h"
+
+#include "config.h"
+
+#define dout(x) if (x <= g_conf.debug) *_dout << dbeginl << g_clock.now() << " journal "
+#define derr(x) if (x <= g_conf.debug) *_derr << dbeginl << g_clock.now() << " journal "
+
+/*
+ * Open the journal and re-apply every entry that belongs to the
+ * current or the immediately following epoch.
+ *
+ * Returns the number of transactions applied, 0 if there is no
+ * journal, or the negative value returned by Journal::open() on
+ * failure (in which case the journal is deleted and disabled).
+ */
+int JournalingObjectStore::journal_replay()
+{
+  if (!journal)
+    return 0;
+
+  int err = journal->open(super_epoch);
+  if (err < 0) {
+    // err is negative on this path, while strerror() expects a
+    // positive errno value, so negate it for the message.
+    dout(3) << "journal_replay open failed with " << err
+	    << " " << strerror(-err) << dendl;
+    delete journal;
+    journal = 0;
+    return err;
+  }
+
+  int count = 0;
+  while (1) {
+    bufferlist bl;
+    epoch_t e;
+    if (!journal->read_entry(bl, e)) {
+      dout(3) << "journal_replay: end of journal, done." << dendl;
+      break;
+    }
+
+    // entries older than the store's epoch are skipped, not applied
+    if (e < super_epoch) {
+      dout(3) << "journal_replay: skipping old entry in epoch " << e << " < " << super_epoch << dendl;
+      continue;
+    }
+    // an entry from the very next epoch advances our epoch by one
+    if (e == super_epoch+1) {
+      super_epoch++;
+      dout(3) << "journal_replay: jumped to next epoch " << super_epoch << dendl;
+    }
+    // anything else (a gap of more than one epoch) is a fatal inconsistency
+    assert(e == super_epoch);
+
+    dout(3) << "journal_replay: applying transaction in epoch " << e << dendl;
+    Transaction t(bl);
+    apply_transaction(t);
+    count++;
+  }
+
+  // done reading, make writeable.
+  journal->make_writeable();
+
+  return count;
+}
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+#ifndef __JOURNALINGOBJECTSTORE_H
+#define __JOURNALINGOBJECTSTORE_H
+
+#include "ObjectStore.h"
+#include "Journal.h"
+
+/*
+ * JournalingObjectStore - ObjectStore base that pairs the store with
+ * an optional Journal.
+ *
+ * Each journal_*() method wraps one mutation in a Transaction, encodes
+ * it and submits it to the journal under the current super_epoch.  If
+ * there is no journal, or it is not writeable, the onsafe callback is
+ * instead parked in commit_waiters under the current epoch and handed
+ * to the finisher by commit_finish().
+ */
+class JournalingObjectStore : public ObjectStore {
+protected:
+  epoch_t super_epoch;
+  Journal *journal;
+  Finisher finisher;
+  map<version_t, list<Context*> > commit_waiters;  // epoch -> callbacks parked for it
+
+private:
+  // true when a journal exists and reports itself writeable
+  bool journal_ready() {
+    return journal && journal->is_writeable();
+  }
+
+  // encode t and submit it under the current epoch; journal must be non-null
+  void journal_submit(Transaction& t, Context *onsafe) {
+    bufferlist bl;
+    t.encode(bl);
+    journal->submit_entry(super_epoch, bl, onsafe);
+  }
+
+protected:
+  void journal_start() {
+    finisher.start();
+  }
+  void journal_stop() {
+    finisher.stop();
+  }
+  int journal_replay();
+
+  // advance to the next epoch and tell the journal about the boundary
+  void commit_start() {
+    super_epoch++;
+    if (journal)
+      journal->commit_epoch_start(super_epoch);
+  }
+
+  // hand the waiters of epoch super_epoch-1 to the finisher and tell
+  // the journal that commit finished.
+  void commit_finish() {
+    finisher.queue(commit_waiters[super_epoch-1]);
+    // drop the map slot; otherwise one entry per epoch is kept forever.
+    // NOTE(review): assumes Finisher::queue() has taken the contexts
+    // by the time it returns -- confirm.
+    commit_waiters.erase(super_epoch-1);
+    if (journal)
+      journal->commit_epoch_finish(super_epoch);
+  }
+
+  // park a (non-null) callback under the current epoch
+  void queue_commit_waiter(Context *oncommit) {
+    if (oncommit)
+      commit_waiters[super_epoch].push_back(oncommit);
+  }
+
+  void journal_write(pobject_t oid, off_t off, size_t len, const bufferlist& bl, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.write(oid, off, len, bl);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_zero(pobject_t oid, off_t off, size_t len, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.zero(oid, off, len);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_remove(pobject_t oid, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.remove(oid);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_truncate(pobject_t oid, off_t size, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.truncate(oid, size);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_clone(pobject_t from, pobject_t to, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.clone(from, to);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_setattr(pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.setattr(oid, name, value, size);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_setattrs(pobject_t oid, map<string,bufferptr>& attrset, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.setattrs(oid, attrset);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_rmattr(pobject_t oid, const char *name, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.rmattr(oid, name);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_create_collection(coll_t cid, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.create_collection(cid);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_destroy_collection(coll_t cid, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.remove_collection(cid);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_collection_add(coll_t cid, pobject_t oid, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.collection_add(cid, oid);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_collection_remove(coll_t cid, pobject_t oid, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.collection_remove(cid, oid);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.collection_setattr(cid, name, value, size);
+    journal_submit(t, onsafe);
+  }
+
+  void journal_collection_setattrs(coll_t cid, map<string,bufferptr>& aset, Context *onsafe) {
+    if (!journal_ready()) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    t.collection_setattrs(cid, aset);
+    journal_submit(t, onsafe);
+  }
+
+  // journal an empty transaction so onsafe fires once it is committed.
+  // NOTE(review): unlike every other journal_*() method this only
+  // checks that a journal exists, not is_writeable() -- confirm that
+  // the difference is intended.
+  void journal_sync(Context *onsafe) {
+    if (!journal) {
+      queue_commit_waiter(onsafe);
+      return;
+    }
+    Transaction t;
+    journal_submit(t, onsafe);
+  }
+
+public:
+  JournalingObjectStore() : super_epoch(0), journal(0) { }
+
+};
+
+#endif
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+
+#include "ObjectStore.h"
+
+#include "config.h"
+#include "common/Clock.h"
+
+#define dout(x) if (x < g_conf.debug) *_dout << dbeginl << g_clock.now() << " ager: "
+
+/*
+ * Pick the object id for the next aged object: reuse the oldest id on
+ * the free list if there is one, otherwise hand out age_cur_oid and
+ * advance it.
+ */
+object_t ObjectStore::age_get_oid() {
+  if (age_free_oids.empty())
+    return age_cur_oid++;
+
+  object_t recycled = age_free_oids.front();
+  age_free_oids.pop_front();
+  return recycled;
+}
+
+ /*
+  * Draw a sample from file_size_distn, scale it by 1024, and return a
+  * pseudo-random size between half of that value and the full value
+  * (plus one).
+  */
+ ssize_t ObjectStore::age_pick_size() {
+   const ssize_t cap = file_size_distn.sample() * 1024;
+   const ssize_t base = cap/2;
+   const ssize_t jitter = (rand() % 100) * cap/200;
+   return base + jitter + 1;
+ }
+
+ /*
+  * Create objects until the used fraction reported by statfs() reaches
+  * pc, or until the clock passes `until`.
+  *
+  * Every new object is recorded in a random one of the ten age_objects
+  * buckets and written in chunks of at most 1MB.
+  */
+ void ObjectStore::age_fill(float pc, utime_t until) {
+   // one zeroed 1MB buffer is reused as the payload of every write
+   bufferptr bp(1024*1024);
+   bp.zero();
+   bufferlist bl;
+   bl.push_back(bp);
+   while (1) {
+     if (g_clock.now() > until) break;
+
+     struct statfs st;
+     statfs(&st);
+     float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+     if (a >= pc) {
+       dout(10) << "age_fill at " << a << " / " << pc << " stopping" << dendl;
+       break;
+     }
+
+     object_t oid = age_get_oid();
+
+     int b = rand() % 10;
+     age_objects[b].push_back(oid);
+
+     ssize_t s = age_pick_size();
+
+     dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl;
+
+     off_t off = 0;
+     while (s) {
+       ssize_t t = MIN(s, 1024*1024);
+       // write() takes (oid, offset, len, bl, onsafe): offset first,
+       // then length, and a null Context* since nobody waits on it.
+       write(oid, off, t, bl, 0);
+       off += t;
+       s -= t;
+     }
+   }
+ }
+
+ /*
+  * Remove aged objects until the used fraction reported by statfs()
+  * drops to pc or below.  Victims are taken from the front of a random
+  * age_objects bucket; a sync() is issued every `nper` rounds and also
+  * whenever the chosen bucket is empty.
+  */
+ void ObjectStore::age_empty(float pc) {
+   const int nper = 20;
+   int left = nper;
+   for (;;) {
+     struct statfs st;
+     statfs(&st);
+     float used = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
+     if (used <= pc) {
+       dout(10) << "age_empty at " << used << " / " << pc << " stopping" << dendl;
+       break;
+     }
+
+     int bucket = rand() % 10;
+     --left;
+     if (left == 0 || age_objects[bucket].empty()) {
+       dout(10) << "age_empty sync" << dendl;
+       sync();
+       left = nper;
+       continue;
+     }
+
+     object_t victim = age_objects[bucket].front();
+     age_objects[bucket].pop_front();
+
+     dout(10) << "age_empty at " << used << " / " << pc << " removing " << hex << victim << dec << dendl;
+
+     remove(victim);
+     // make the id available to age_get_oid() again
+     age_free_oids.push_back(victim);
+   }
+ }
+
+
+ /*
+  * Run up to `count` fill/empty cycles, stopping early once `time`
+  * seconds have elapsed.  Each cycle fills to high_water and then
+  * empties to low_water, except the last one, which empties to
+  * final_water.  When fake_size_mb is non-zero, all three water marks
+  * are scaled by (fake_size_mb * 256) / f_blocks first.
+  */
+ void ObjectStore::age(int time,
+                       float high_water,    // fill to this %
+                       float low_water,     // then empty to this %
+                       int count,           // this many times
+                       float final_water,   // and end here ( <= low_water)
+                       int fake_size_mb) {
+   utime_t until = g_clock.now();
+   until.sec_ref() += time;
+
+   // make sure all ten object buckets exist
+   while (age_objects.size() < 10)
+     age_objects.push_back( list<object_t>() );
+
+   if (fake_size_mb) {
+     int fake_bl = fake_size_mb * 256;
+     struct statfs st;
+     statfs(&st);
+     float f = (float)fake_bl / (float)st.f_blocks;
+     high_water *= f;
+     low_water *= f;
+     final_water *= f;
+     dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl;
+   }
+
+   // init size distn (once)
+   if (!did_distn) {
+     did_distn = true;
+     age_cur_oid = 1;
+     file_size_distn.add(1, 19.0758125+0.65434375);
+     file_size_distn.add(512, 35.6566);
+     file_size_distn.add(1024, 27.7271875);
+     file_size_distn.add(2*1024, 16.63503125);
+     // larger buckets are currently disabled:
+     //file_size_distn.add(4*1024, 106.82384375);
+     //file_size_distn.add(8*1024, 81.493375);
+     //file_size_distn.add(16*1024, 14.13553125);
+     //file_size_distn.add(32*1024, 2.176);
+     //file_size_distn.add(256*1024, 0.655938);
+     //file_size_distn.add(512*1024, 0.1480625);
+     //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
+     file_size_distn.normalize();
+   }
+
+   // clear
+   for (int bucket = 0; bucket < 10; ++bucket)
+     age_objects[bucket].clear();
+
+   for (int c = 1; c <= count && !(g_clock.now() > until); c++) {
+     dout(1) << "age " << c << "/" << count << " filling to " << high_water << dendl;
+     age_fill(high_water, until);
+     if (c != count) {
+       dout(1) << "age " << c << "/" << count << " emptying to " << low_water << dendl;
+       age_empty(low_water);
+     } else {
+       dout(1) << "age final empty to " << final_water << dendl;
+       age_empty(final_water);
+     }
+   }
+   dout(1) << "age finished" << dendl;
+ }
+
--- /dev/null
+// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
+// vim: ts=8 sw=2 smarttab
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation. See file COPYING.
+ *
+ */
+
+
+#ifndef __OBJECTSTORE_H
+#define __OBJECTSTORE_H
+
+#include "include/types.h"
+#include "include/Context.h"
+#include "include/buffer.h"
+#include "include/pobject.h"
+
+#include "include/Distribution.h"
+
+#include <sys/stat.h>
+
+#ifdef DARWIN
+#include <sys/statvfs.h>
+#else
+#include <sys/vfs.h> /* or <sys/statfs.h> */
+#endif /* DARWIN */
+
+#include <list>
+using std::list;
+
+#ifndef MIN
+# define MIN(a,b) ((a) < (b) ? (a):(b))
+#endif
+
+/*
+ * low-level interface to the local OSD file system
+ */
+
+
+
+class ObjectStore {
+public:
+
+
+ // Plain bag of fragmentation counters; passed by reference to
+ // _get_frag_stat(), which presumably fills it in (the default
+ // implementation in this class leaves it untouched).
+ class FragmentationStat {
+ public:
+ int total;
+ int num_extent;
+ int avg_extent;
+ map<int,int> extent_dist; // powers of two
+ map<int,int> extent_dist_sum; // powers of two
+
+ float avg_extent_per_object;
+ int avg_extent_jump; // avg distance between consecutive extents
+
+ // same counters as above, for free extents
+ int total_free;
+ int num_free_extent;
+ int avg_free_extent;
+ map<int,int> free_extent_dist; // powers of two
+ map<int,int> free_extent_dist_sum; // powers of two
+ };
+
+
+
+ /*********************************
+ * transaction
+ */
+ /*
+  * Transaction - an ordered queue of operations plus their operands.
+  *
+  * Each builder method appends one opcode to `ops` and its operands to
+  * the per-type queues; the get_*() methods pop them off again in the
+  * same order.  Only ops, bls, oids, cids, lengths and attrnames are
+  * written by encode().  The read-side pointers and the attribute
+  * sets passed to getattrs()/setattrs()/collection_setattrs() are held
+  * by pointer in pattrsets and are not encoded.
+  */
+ class Transaction {
+ public:
+   // opcodes; the trailing comment lists each op's operands
+   static const int OP_READ = 1;          // oid, offset, len, pbl
+   static const int OP_STAT = 2;          // oid, pstat
+   static const int OP_GETATTR = 3;       // oid, attrname, pattrval
+   static const int OP_GETATTRS = 4;      // oid, pattrset
+
+   static const int OP_WRITE = 10;        // oid, offset, len, bl
+   static const int OP_ZERO = 11;         // oid, offset, len
+   static const int OP_TRUNCATE = 12;     // oid, len
+   static const int OP_REMOVE = 13;       // oid
+   static const int OP_SETATTR = 14;      // oid, attrname, attrval
+   static const int OP_SETATTRS = 15;     // oid, attrset
+   static const int OP_RMATTR = 16;       // oid, attrname
+   static const int OP_CLONE = 17;        // oid, newoid
+
+   static const int OP_TRIMCACHE = 18;    // oid, offset, len
+
+   static const int OP_MKCOLL = 20;       // cid
+   static const int OP_RMCOLL = 21;       // cid
+   static const int OP_COLL_ADD = 22;     // cid, oid
+   static const int OP_COLL_REMOVE = 23;  // cid, oid
+   static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval
+   static const int OP_COLL_RMATTR = 25;  // cid, attrname
+   static const int OP_COLL_SETATTRS = 26; // cid, attrset
+
+ private:
+   list<int8_t> ops;
+   list<bufferlist> bls;
+   list<pobject_t> oids;
+   list<coll_t> cids;
+   list<int64_t> lengths;
+   list<const char*> attrnames;
+   list<string> attrnames2;   // backing storage for attrnames after decode()
+
+   // for reads only (not encoded)
+   list<bufferlist*> pbls;
+   list<struct stat*> psts;
+   list< pair<void*,int*> > pattrvals;
+   list< map<string,bufferptr>* > pattrsets;
+
+   // append an opcode; taken by value, as the original's local copy did
+   void push_op(int op) {
+     ops.push_back(op);
+   }
+   // append an (offset, length) operand pair
+   void push_extent(off_t off, size_t len) {
+     lengths.push_back(off);
+     lengths.push_back(len);
+   }
+   // append an attribute value as its own bufferlist
+   void push_value(const void *val, int len) {
+     bufferlist bl;
+     bl.append((char*)val, len);
+     bls.push_back(bl);
+   }
+   // copy the front element of q into out, then drop it from q
+   template<typename Q, typename V>
+   static void take_front(Q& q, V& out) {
+     out = q.front();
+     q.pop_front();
+   }
+
+ public:
+   // --- consuming side ---
+   bool have_op() { return !ops.empty(); }
+   int get_num_ops() { return ops.size(); }
+   int get_op() {
+     int op;
+     take_front(ops, op);
+     return op;
+   }
+   void get_bl(bufferlist& bl) {
+     bl.claim(bls.front());
+     bls.pop_front();
+   }
+   void get_oid(pobject_t& oid) { take_front(oids, oid); }
+   void get_cid(coll_t& cid) { take_front(cids, cid); }
+   void get_length(off_t& len) { take_front(lengths, len); }
+   void get_attrname(const char * &p) { take_front(attrnames, p); }
+   void get_pbl(bufferlist* &pbl) { take_front(pbls, pbl); }
+   void get_pstat(struct stat* &pst) { take_front(psts, pst); }
+   void get_pattrval(pair<void*,int*>& p) { take_front(pattrvals, p); }
+   void get_pattrset(map<string,bufferptr>* &ps) { take_front(pattrsets, ps); }
+
+   // --- read ops (results go through caller-supplied pointers) ---
+   void read(pobject_t oid, off_t off, size_t len, bufferlist *pbl) {
+     push_op(OP_READ);
+     oids.push_back(oid);
+     push_extent(off, len);
+     pbls.push_back(pbl);
+   }
+   void stat(pobject_t oid, struct stat *st) {
+     push_op(OP_STAT);
+     oids.push_back(oid);
+     psts.push_back(st);
+   }
+   void getattr(pobject_t oid, const char* name, void* val, int *plen) {
+     push_op(OP_GETATTR);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+     pair<void*,int*> dest(val, plen);
+     pattrvals.push_back(dest);
+   }
+   void getattrs(pobject_t oid, map<string,bufferptr>& aset) {
+     push_op(OP_GETATTRS);
+     oids.push_back(oid);
+     pattrsets.push_back(&aset);
+   }
+
+   // --- object mutations ---
+   void write(pobject_t oid, off_t off, size_t len, const bufferlist& bl) {
+     push_op(OP_WRITE);
+     oids.push_back(oid);
+     push_extent(off, len);
+     bls.push_back(bl);
+   }
+   void zero(pobject_t oid, off_t off, size_t len) {
+     push_op(OP_ZERO);
+     oids.push_back(oid);
+     push_extent(off, len);
+   }
+   void trim_from_cache(pobject_t oid, off_t off, size_t len) {
+     push_op(OP_TRIMCACHE);
+     oids.push_back(oid);
+     push_extent(off, len);
+   }
+   void truncate(pobject_t oid, off_t off) {
+     push_op(OP_TRUNCATE);
+     oids.push_back(oid);
+     lengths.push_back(off);
+   }
+   void remove(pobject_t oid) {
+     push_op(OP_REMOVE);
+     oids.push_back(oid);
+   }
+   void setattr(pobject_t oid, const char* name, const void* val, int len) {
+     push_op(OP_SETATTR);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+     push_value(val, len);
+   }
+   // only the address of attrset is kept; it must outlive the transaction
+   void setattrs(pobject_t oid, map<string,bufferptr>& attrset) {
+     push_op(OP_SETATTRS);
+     oids.push_back(oid);
+     pattrsets.push_back(&attrset);
+   }
+   void rmattr(pobject_t oid, const char* name) {
+     push_op(OP_RMATTR);
+     oids.push_back(oid);
+     attrnames.push_back(name);
+   }
+   void clone(pobject_t oid, pobject_t noid) {
+     push_op(OP_CLONE);
+     oids.push_back(oid);
+     oids.push_back(noid);
+   }
+
+   // --- collection mutations ---
+   void create_collection(coll_t cid) {
+     push_op(OP_MKCOLL);
+     cids.push_back(cid);
+   }
+   void remove_collection(coll_t cid) {
+     push_op(OP_RMCOLL);
+     cids.push_back(cid);
+   }
+   void collection_add(coll_t cid, pobject_t oid) {
+     push_op(OP_COLL_ADD);
+     cids.push_back(cid);
+     oids.push_back(oid);
+   }
+   void collection_remove(coll_t cid, pobject_t oid) {
+     push_op(OP_COLL_REMOVE);
+     cids.push_back(cid);
+     oids.push_back(oid);
+   }
+   void collection_setattr(coll_t cid, const char* name, const void* val, int len) {
+     push_op(OP_COLL_SETATTR);
+     cids.push_back(cid);
+     attrnames.push_back(name);
+     push_value(val, len);
+   }
+   void collection_rmattr(coll_t cid, const char* name) {
+     push_op(OP_COLL_RMATTR);
+     cids.push_back(cid);
+     attrnames.push_back(name);
+   }
+   // only the address of aset is kept; it must outlive the transaction
+   void collection_setattrs(coll_t cid, map<string,bufferptr>& aset) {
+     push_op(OP_COLL_SETATTRS);
+     cids.push_back(cid);
+     pattrsets.push_back(&aset);
+   }
+
+   // --- construction and (de)serialization ---
+   Transaction() {}
+   Transaction(bufferlist::iterator &p) { decode(p); }
+   Transaction(bufferlist &bl) {
+     bufferlist::iterator start = bl.begin();
+     decode(start);
+   }
+
+   void encode(bufferlist& bl) const {
+     ::encode(ops, bl);
+     ::encode(bls, bl);
+     ::encode(oids, bl);
+     ::encode(cids, bl);
+     ::encode(lengths, bl);
+     ::encode(attrnames, bl);
+   }
+   void decode(bufferlist::iterator &bl) {
+     ::decode(ops, bl);
+     ::decode(bls, bl);
+     ::decode(oids, bl);
+     ::decode(cids, bl);
+     ::decode(lengths, bl);
+     // names are decoded into owned strings; attrnames then points at
+     // their storage, so the pointers are tied to attrnames2.
+     ::decode(attrnames2, bl);
+     list<string>::iterator name = attrnames2.begin();
+     while (name != attrnames2.end()) {
+       attrnames.push_back(name->c_str());
+       ++name;
+     }
+   }
+ };
+
+ /*
+  * Hooks bracketing apply_transaction() below.  A store that wants
+  * atomic transactions should override both; the defaults do nothing.
+  * transaction_end() receives whatever transaction_start() returned.
+  */
+ virtual int transaction_start()
+ {
+   return 0;
+ }
+ virtual void transaction_end(int id)
+ {
+ }
+ /*
+  * Apply every operation queued in t, in order, by popping its
+  * operands and calling the matching virtual method of this store.
+  * The sequence is bracketed by transaction_start()/transaction_end()
+  * and is not atomic in this default implementation.
+  *
+  * Mutations are issued with a null onsafe; if the caller supplied
+  * onsafe, it is passed to a single sync() after the last op.
+  * An unknown opcode is reported on cerr and trips assert(0).
+  *
+  * Always returns 0 (errors from individual ops are not counted).
+  */
+ virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) {
+   // non-atomic implementation
+   int id = transaction_start();
+   while (t.have_op()) {
+     int op = t.get_op();
+     switch (op) {
+     case Transaction::OP_READ:
+       {
+         pobject_t oid;
+         off_t offset, len;
+         t.get_oid(oid);
+         t.get_length(offset);
+         t.get_length(len);
+         bufferlist *pbl;
+         t.get_pbl(pbl);
+         read(oid, offset, len, *pbl);
+       }
+       break;
+
+     case Transaction::OP_STAT:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         struct stat *st;
+         t.get_pstat(st);
+         stat(oid, st);
+       }
+       break;
+
+     case Transaction::OP_GETATTR:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         const char *attrname;
+         t.get_attrname(attrname);
+         pair<void*,int*> pattrval;
+         t.get_pattrval(pattrval);
+         // the int behind .second is the buffer size on input and is
+         // overwritten with getattr()'s return value
+         *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second);
+       }
+       break;
+
+     case Transaction::OP_GETATTRS:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         map<string,bufferptr> *pset;
+         t.get_pattrset(pset);
+         getattrs(oid, *pset);
+       }
+       break;
+
+     case Transaction::OP_WRITE:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         off_t offset, len;
+         t.get_length(offset);
+         t.get_length(len);
+         bufferlist bl;
+         t.get_bl(bl);
+         write(oid, offset, len, bl, 0);
+       }
+       break;
+
+     case Transaction::OP_ZERO:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         off_t offset, len;
+         t.get_length(offset);
+         t.get_length(len);
+         zero(oid, offset, len, 0);
+       }
+       break;
+
+     case Transaction::OP_TRIMCACHE:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         off_t offset, len;
+         t.get_length(offset);
+         t.get_length(len);
+         trim_from_cache(oid, offset, len);
+       }
+       break;
+
+     case Transaction::OP_TRUNCATE:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         off_t len;
+         t.get_length(len);
+         truncate(oid, len, 0);
+       }
+       break;
+
+     case Transaction::OP_REMOVE:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         remove(oid, 0);
+       }
+       break;
+
+     case Transaction::OP_SETATTR:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         const char *attrname;
+         t.get_attrname(attrname);
+         bufferlist bl;
+         t.get_bl(bl);
+         setattr(oid, attrname, bl.c_str(), bl.length(), 0);
+       }
+       break;
+
+     case Transaction::OP_SETATTRS:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         map<string,bufferptr> *pattrset;
+         t.get_pattrset(pattrset);
+         setattrs(oid, *pattrset, 0);
+       }
+       break;
+
+     case Transaction::OP_RMATTR:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         const char *attrname;
+         t.get_attrname(attrname);
+         rmattr(oid, attrname, 0);
+       }
+       break;
+
+     case Transaction::OP_CLONE:
+       {
+         pobject_t oid;
+         t.get_oid(oid);
+         pobject_t noid;
+         t.get_oid(noid);
+         clone(oid, noid);
+       }
+       break;
+
+     case Transaction::OP_MKCOLL:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         create_collection(cid, 0);
+       }
+       break;
+
+     case Transaction::OP_RMCOLL:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         destroy_collection(cid, 0);
+       }
+       break;
+
+     case Transaction::OP_COLL_ADD:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         pobject_t oid;
+         t.get_oid(oid);
+         collection_add(cid, oid, 0);
+       }
+       break;
+
+     case Transaction::OP_COLL_REMOVE:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         pobject_t oid;
+         t.get_oid(oid);
+         collection_remove(cid, oid, 0);
+       }
+       break;
+
+     case Transaction::OP_COLL_SETATTR:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         const char *attrname;
+         t.get_attrname(attrname);
+         bufferlist bl;
+         t.get_bl(bl);
+         collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0);
+       }
+       break;
+
+     case Transaction::OP_COLL_RMATTR:
+       {
+         coll_t cid;
+         t.get_cid(cid);
+         const char *attrname;
+         t.get_attrname(attrname);
+         collection_rmattr(cid, attrname, 0);
+       }
+       break;
+
+     case Transaction::OP_COLL_SETATTRS:
+       {
+         // queued by Transaction::collection_setattrs(); without this
+         // case the op would fall through to the "bad op" assert.
+         coll_t cid;
+         t.get_cid(cid);
+         map<string,bufferptr> *pattrset;
+         t.get_pattrset(pattrset);
+         collection_setattrs(cid, *pattrset);
+       }
+       break;
+
+     default:
+       cerr << "bad op " << op << std::endl;
+       assert(0);
+     }
+   }
+   transaction_end(id);
+
+   if (onsafe) sync(onsafe);
+
+   return 0; // FIXME count errors
+ }
+
+ /*********************************************/
+
+
+
+ public:
+ // Construction and destruction do nothing at this level.
+ ObjectStore() {}
+ virtual ~ObjectStore() {}
+
+ // mgmt
+ // All three are pure virtual; every concrete store must provide them.
+ virtual int mount() = 0;
+ virtual int umount() = 0;
+ virtual int mkfs() = 0; // wipe
+
+ // Fill *buf with usage figures; the ager code in this file reads
+ // f_blocks and f_bavail from the result.
+ virtual int statfs(struct statfs *buf) = 0;
+
+ // objects
+ // NOTE(review): semantics not visible here; oid is taken by
+ // non-const reference, so it is presumably updated in place.
+ virtual int pick_object_revision_lt(pobject_t& oid) = 0;
+
+ virtual bool exists(pobject_t oid) = 0; // useful?
+ virtual int stat(pobject_t oid, struct stat *st) = 0; // struct stat?
+
+ // Mutating calls take an optional Context; apply_transaction()
+ // passes 0 for it and syncs once at the end instead.
+ virtual int remove(pobject_t oid,
+ Context *onsafe=0) = 0;
+
+ virtual int truncate(pobject_t oid, off_t size,
+ Context *onsafe=0) = 0;
+
+ // Argument order for both is (oid, offset, len, buffer).
+ virtual int read(pobject_t oid,
+ off_t offset, size_t len,
+ bufferlist& bl) = 0;
+ virtual int write(pobject_t oid,
+ off_t offset, size_t len,
+ const bufferlist& bl,
+ Context *onsafe) = 0;//{ return -1; }
+ /*
+  * Default zero(): emulate it by writing a buffer of len zero bytes at
+  * offset.  Stores with a cheaper way to zero a range should override.
+  * Returns whatever write() returns.
+  */
+ virtual int zero(pobject_t oid,
+                  off_t offset, size_t len,
+                  Context *onsafe) {
+   // write zeros.. yuck!
+   bufferptr bp(len);
+   // the buffer must be cleared explicitly before it is written
+   bp.zero();
+   bufferlist bl;
+   bl.push_back(bp);
+   return write(oid, offset, len, bl, onsafe);
+ }
+ // Cache hooks: the defaults do nothing / return -1.
+ virtual void trim_from_cache(pobject_t oid,
+ off_t offset, size_t len) { }
+ virtual int is_cached(pobject_t oid,
+ off_t offset,
+ size_t len) { return -1; }
+
+ // Object attributes.  The default bodies below return 0 without
+ // storing or fetching anything; a store that supports attributes
+ // must override them.
+ virtual int setattr(pobject_t oid, const char *name,
+ const void *value, size_t size,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int setattrs(pobject_t oid, map<string,bufferptr>& aset,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int getattr(pobject_t oid, const char *name,
+ void *value, size_t size) {return 0;} //= 0;
+ virtual int getattrs(pobject_t oid, map<string,bufferptr>& aset) {return 0;};
+
+ virtual int rmattr(pobject_t oid, const char *name,
+ Context *onsafe=0) {return 0;}
+
+ // Default: cloning is unsupported (returns -1).
+ virtual int clone(pobject_t oid, pobject_t noid) {
+ return -1;
+ }
+
+ virtual int list_objects(list<pobject_t>& ls) = 0;//{ return -1; }
+
+ // Default returns -1 and leaves ls untouched.
+ virtual int get_object_collections(pobject_t oid, set<coll_t>& ls) { return -1; }
+
+ //virtual int listattr(pobject_t oid, char *attrs, size_t size) {return 0;} //= 0;
+
+ // collections
+ // The default bodies return 0 (false for collection_exists) and do
+ // nothing else.
+ virtual int list_collections(list<coll_t>& ls) {return 0;}//= 0;
+ virtual int create_collection(coll_t c,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual int destroy_collection(coll_t c,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual bool collection_exists(coll_t c) {return 0;}
+ virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0;
+ virtual int collection_add(coll_t c, pobject_t o,
+ Context *onsafe=0) {return 0;}//= 0;
+ virtual int collection_remove(coll_t c, pobject_t o,
+ Context *onsafe=0) {return 0;}// = 0;
+ virtual int collection_list(coll_t c, list<pobject_t>& o) {return 0;}//= 0;
+
+ // Collection attributes: single-attribute calls default to no-ops
+ // returning 0.
+ virtual int collection_setattr(coll_t cid, const char *name,
+ const void *value, size_t size,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int collection_rmattr(coll_t cid, const char *name,
+ Context *onsafe=0) {return 0;} //= 0;
+ virtual int collection_getattr(coll_t cid, const char *name,
+ void *value, size_t size) {return 0;} //= 0;
+
+ // Unlike the single-attribute calls, the whole-set variants are pure
+ // virtual and take no onsafe Context.
+ virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;//{ return -1; }
+ virtual int collection_setattrs(coll_t cid, map<string,bufferptr> &aset) = 0;//{ return -1; }
+
+
+ //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0;
+
+ // Default sync()s do nothing; in particular sync(onsync) neither
+ // calls nor deletes the Context it is given.
+ virtual void sync(Context *onsync) {}
+ virtual void sync() {}
+
+
+ // Debug/instrumentation hooks; no-ops by default.
+ virtual void _fake_writes(bool b) {};
+
+ virtual void _get_frag_stat(FragmentationStat& st) {};
+
+};
+
+
+#endif
#include "include/types.h"
#include "Ager.h"
-#include "ObjectStore.h"
+#include "os/ObjectStore.h"
#include "config.h"
#include "common/Clock.h"
#include "include/types.h"
#include "include/Distribution.h"
-#include "ObjectStore.h"
+#include "os/ObjectStore.h"
#include "common/Clock.h"
#include <list>
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __BERKELEYDB_H
-#define __BERKELEYDB_H
-
-#include <db.h>
-#include <unistd.h>
-
-#include <list>
-using namespace std;
-
-
-template<typename K, typename D>
-class BDBMap {
- private:
- DB *dbp;
-
- public:
- BDBMap() : dbp(0) {}
- ~BDBMap() {
- close();
- }
-
- bool is_open() { return dbp ? true:false; }
-
- // open/close
- int open(const char *fn) {
- //cout << "open " << fn << endl;
-
- int r;
- if ((r = db_create(&dbp, NULL, 0)) != 0) {
- cerr << "db_create: " << db_strerror(r) << endl;
- assert(0);
- }
-
- dbp->set_errfile(dbp, stderr);
- dbp->set_errpfx(dbp, "bdbmap");
-
- r = dbp->open(dbp, NULL, fn, NULL, DB_BTREE, DB_CREATE, 0644);
- if (r != 0) {
- dbp->err(dbp, r, "%s", fn);
- }
- assert(r == 0);
- return 0;
- }
- void close() {
- if (dbp) {
- dbp->close(dbp,0);
- dbp = 0;
- }
- }
- void remove(const char *fn) {
- if (!dbp) open(fn);
- if (dbp) {
- dbp->remove(dbp, fn, 0, 0);
- dbp = 0;
- } else {
- ::unlink(fn);
- }
- }
-
- // accessors
- int put(K key,
- D data) {
- DBT k;
- memset(&k, 0, sizeof(k));
- k.data = &key;
- k.size = sizeof(K);
- DBT d;
- memset(&d, 0, sizeof(d));
- d.data = &data;
- d.size = sizeof(data);
- return dbp->put(dbp, NULL, &k, &d, 0);
- }
-
- int get(K key,
- D& data) {
- DBT k;
- memset(&k, 0, sizeof(k));
- k.data = &key;
- k.size = sizeof(key);
- DBT d;
- memset(&d, 0, sizeof(d));
- d.data = &data;
- d.size = sizeof(data);
- int r = dbp->get(dbp, NULL, &k, &d, 0);
- return r;
- }
-
- int del(K key) {
- DBT k;
- memset(&k, 0, sizeof(k));
- k.data = &key;
- k.size = sizeof(key);
- return dbp->del(dbp, NULL, &k, 0);
- }
-
- int list_keys(list<K>& ls) {
- DBC *cursor = 0;
- int r = dbp->cursor(dbp, NULL, &cursor, 0);
- assert(r == 0);
-
- DBT k,d;
- memset(&k, 0, sizeof(k));
- memset(&d, 0, sizeof(d));
-
- while ((r = cursor->c_get(cursor, &k, &d, DB_NEXT)) == 0) {
- K key;
- assert(k.size == sizeof(key));
- memcpy(&key, k.data, k.size);
- ls.push_back(key);
- }
- if (r != DB_NOTFOUND) {
- dbp->err(dbp, r, "DBcursor->get");
- assert(r == DB_NOTFOUND);
- }
-
- cursor->c_close(cursor);
- return 0;
- }
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __FAKE_H
-#define __FAKE_H
-
-#include "include/types.h"
-
-#include <list>
-#include <set>
-#include <ext/hash_map>
-using namespace std;
-using namespace __gnu_cxx;
-
-class FakeStoreCollections {
- private:
- Mutex faker_lock;
- ObjectStore *store;
- hash_map<coll_t, set<pobject_t> > fakecollections;
-
- public:
- FakeStoreCollections(ObjectStore *s) : store(s) {}
-
- // faked collections
- int list_collections(list<coll_t>& ls) {
- faker_lock.Lock();
- int r = 0;
- for (hash_map< coll_t, set<pobject_t> >::iterator p = fakecollections.begin();
- p != fakecollections.end();
- p++) {
- r++;
- ls.push_back(p->first);
- }
- faker_lock.Unlock();
- return r;
- }
-
- int create_collection(coll_t c,
- Context *onsafe=0) {
- faker_lock.Lock();
- fakecollections[c].size();
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return 0;
- }
-
- int destroy_collection(coll_t c,
- Context *onsafe=0) {
- int r = 0;
- faker_lock.Lock();
- if (fakecollections.count(c)) {
- fakecollections.erase(c);
- //fakecattr.erase(c);
- if (onsafe) store->sync(onsafe);
- } else
- r = -1;
- faker_lock.Unlock();
- return r;
- }
-
- int collection_stat(coll_t c, struct stat *st) {
- return collection_exists(c) ? 0:-1;
- }
-
- bool collection_exists(coll_t c) {
- faker_lock.Lock();
- int r = fakecollections.count(c);
- faker_lock.Unlock();
- return r;
- }
-
- int collection_add(coll_t c, pobject_t o,
- Context *onsafe=0) {
- faker_lock.Lock();
- fakecollections[c].insert(o);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return 0;
- }
-
- int collection_remove(coll_t c, pobject_t o,
- Context *onsafe=0) {
- faker_lock.Lock();
- fakecollections[c].erase(o);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return 0;
- }
-
- int collection_list(coll_t c, list<pobject_t>& o) {
- faker_lock.Lock();
- int r = 0;
- for (set<pobject_t>::iterator p = fakecollections[c].begin();
- p != fakecollections[c].end();
- p++) {
- o.push_back(*p);
- r++;
- }
- faker_lock.Unlock();
- return r;
- }
-
-};
-
-class FakeStoreAttrs {
- private:
-
- class FakeAttrSet {
- public:
- map<string, bufferptr> attrs;
-
- int getattr(const char *name, void *value, size_t size) {
- string n = name;
- if (attrs.count(n)) {
- size_t l = MIN( attrs[n].length(), size );
- bufferlist bl;
- bl.append(attrs[n]);
- bl.copy(0, l, (char*)value);
- return l;
- }
- return -1;
- }
- int getattrs(map<string,bufferptr>& aset) {
- aset = attrs;
- return 0;
- }
- int setattrs(map<string,bufferptr>& aset) {
- attrs = aset;
- return 0;
- }
-
- int setattr(const char *name, const void *value, size_t size) {
- string n = name;
- bufferptr bp = buffer::copy((char*)value, size);
- attrs[n] = bp;
- return 0;
- }
-
- int listattr(char *attrs, size_t size) {
- assert(0);
- return 0;
- }
-
- int rmattr(const char *name) {
- string n = name;
- attrs.erase(n);
- return 0;
- }
-
- bool empty() { return attrs.empty(); }
- };
-
- Mutex faker_lock;
- ObjectStore *store;
- hash_map<pobject_t, FakeAttrSet> fakeoattrs;
- hash_map<coll_t, FakeAttrSet> fakecattrs;
-
- public:
- FakeStoreAttrs(ObjectStore *s) : store(s) {}
-
- int setattr(pobject_t oid, const char *name,
- const void *value, size_t size,
- Context *onsafe=0) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].setattr(name, value, size);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return r;
- }
- int setattrs(pobject_t oid, map<string,bufferptr>& aset) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].setattrs(aset);
- faker_lock.Unlock();
- return r;
- }
- int getattr(pobject_t oid, const char *name,
- void *value, size_t size) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].getattr(name, value, size);
- faker_lock.Unlock();
- return r;
- }
- int getattrs(pobject_t oid, map<string,bufferptr>& aset) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].getattrs(aset);
- faker_lock.Unlock();
- return r;
- }
- int rmattr(pobject_t oid, const char *name,
- Context *onsafe=0) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].rmattr(name);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return r;
- }
-
- int listattr(pobject_t oid, char *attrs, size_t size) {
- faker_lock.Lock();
- int r = fakeoattrs[oid].listattr(attrs,size);
- faker_lock.Unlock();
- return r;
- }
-
- int collection_setattr(coll_t c, const char *name,
- void *value, size_t size,
- Context *onsafe=0) {
- faker_lock.Lock();
- int r = fakecattrs[c].setattr(name, value, size);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return r;
- }
- int collection_setattrs(coll_t cid, map<string,bufferptr>& aset) {
- faker_lock.Lock();
- int r = fakecattrs[cid].setattrs(aset);
- faker_lock.Unlock();
- return r;
- }
- int collection_getattrs(coll_t cid, map<string,bufferptr>& aset) {
- faker_lock.Lock();
- int r = fakecattrs[cid].getattrs(aset);
- faker_lock.Unlock();
- return r;
- }
- int collection_rmattr(coll_t c, const char *name,
- Context *onsafe=0) {
- faker_lock.Lock();
- int r = fakecattrs[c].rmattr(name);
- if (onsafe) store->sync(onsafe);
- faker_lock.Unlock();
- return r;
- }
- int collection_getattr(coll_t c, const char *name,
- void *value, size_t size) {
- faker_lock.Lock();
- int r = fakecattrs[c].getattr(name, value, size);
- faker_lock.Unlock();
- return r;
- }
- int collection_listattr(coll_t c, char *attrs, size_t size) {
- faker_lock.Lock();
- int r = fakecattrs[c].listattr(attrs,size);
- faker_lock.Unlock();
- return r;
- }
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-
-#include "FakeStore.h"
-#include "include/types.h"
-
-#include "FileJournal.h"
-
-#include "common/Timer.h"
-
-#include <unistd.h>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-#include <sys/file.h>
-#include <iostream>
-#include <cassert>
-#include <errno.h>
-#include <dirent.h>
-#include <sys/ioctl.h>
-#ifndef __CYGWIN__
-# include <sys/xattr.h>
-#endif
-
-#ifdef DARWIN
-#include <sys/param.h>
-#include <sys/mount.h>
-#endif // DARWIN
-
-
-#ifndef __CYGWIN__
-#ifndef DARWIN
-# include <linux/ioctl.h>
-# define BTRFS_IOCTL_MAGIC 0x94
-# define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6)
-# define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7)
-# define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8)
-# define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int)
-#endif
-#endif
-
-
-#include "config.h"
-
-#define dout(l) if (l<=g_conf.debug_fakestore) *_dout << dbeginl << g_clock.now() << " fakestore(" << basedir << ") "
-#define derr(l) if (l<=g_conf.debug_fakestore) *_derr << dbeginl << g_clock.now() << " fakestore(" << basedir << ") "
-
-#include "include/buffer.h"
-
-#include <map>
-
-
-/*
- * xattr portability stupidity
- */
-
-#ifdef DARWIN
-int do_getxattr(const char *fn, const char *name, void *val, size_t size) {
- return ::getxattr(fn, name, val, size, 0, 0);
-}
-int do_setxattr(const char *fn, const char *name, const void *val, size_t size) {
- return ::setxattr(fn, name, val, size, 0, 0);
-}
-int do_removexattr(const char *fn, const char *name) {
- return ::removexattr(fn, name, 0);
-}
-int do_listxattr(const char *fn, char *names, size_t len) {
- return ::listxattr(fn, names, len, 0);
-}
-#else
-int do_getxattr(const char *fn, const char *name, void *val, size_t size) {
- return ::getxattr(fn, name, val, size);
-}
-int do_setxattr(const char *fn, const char *name, const void *val, size_t size) {
- return ::setxattr(fn, name, val, size, 0);
-}
-int do_removexattr(const char *fn, const char *name) {
- return ::removexattr(fn, name);
-}
-int do_listxattr(const char *fn, char *names, size_t len) {
- return ::listxattr(fn, names, len);
-}
-
-#endif
-
-
-
-
-int FakeStore::statfs(struct statfs *buf)
-{
- if (::statfs(basedir.c_str(), buf) < 0)
- return -errno;
- return 0;
-}
-
-
-/*
- * sorry, these are sentitive to the pobject_t and coll_t typing.
- */
-void FakeStore::get_oname(pobject_t oid, char *s)
-{
- assert(sizeof(oid) == 24);
-#ifdef __LP64__
- sprintf(s, "%s/objects/%04x.%04x.%016lx.%016lx", basedir.c_str(),
- oid.volume, oid.rank,
- *((uint64_t*)&oid.oid),
- *(((uint64_t*)&oid.oid) + 1));
-#else
- sprintf(s, "%s/objects/%04x.%04x.%016llx.%016llx", basedir.c_str(),
- oid.volume, oid.rank,
- *((uint64_t*)&oid.oid),
- *(((uint64_t*)&oid.oid) + 1));
-#endif
-}
-
-pobject_t FakeStore::parse_object(char *s)
-{
- pobject_t o;
- assert(sizeof(o) == 24);
- //cout << " got object " << de->d_name << std::endl;
- o.volume = strtoll(s, 0, 16);
- assert(s[4] == '.');
- o.rank = strtoll(s+5, 0, 16);
- assert(s[9] == '.');
- *(((uint64_t*)&o.oid) + 0) = strtoll(s+10, 0, 16);
- assert(s[26] == '.');
- *(((uint64_t*)&o.oid) + 1) = strtoll(s+27, 0, 16);
- dout(0) << " got " << o << " errno " << errno << " on " << s << dendl;
- return o;
-}
-
-coll_t FakeStore::parse_coll(char *s)
-{
- return strtoll(s, 0, 16);
-}
-
-void FakeStore::get_cdir(coll_t cid, char *s)
-{
- assert(sizeof(cid) == 8);
-#ifdef __LP64__
- sprintf(s, "%s/collections/%016lx", basedir.c_str(),
- cid);
-#else
- sprintf(s, "%s/collections/%016llx", basedir.c_str(),
- cid);
-#endif
-}
-
-void FakeStore::get_coname(coll_t cid, pobject_t oid, char *s)
-{
- assert(sizeof(oid) == 24);
-#ifdef __LP64__
- sprintf(s, "%s/collections/%016lx/%04x.%04x.%016lx.%016lx", basedir.c_str(), cid,
- oid.volume, oid.rank,
- *((uint64_t*)&oid.oid),
- *(((uint64_t*)&oid.oid) + 1));
-#else
- sprintf(s, "%s/collections/%016llx/%04x.%04x.%016llx.%016llx", basedir.c_str(), cid,
- oid.volume, oid.rank,
- *((uint64_t*)&oid),
- *(((uint64_t*)&oid) + 1));
-#endif
-}
-
-
-
-
-int FakeStore::mkfs()
-{
- char cmd[200];
- if (g_conf.fakestore_dev) {
- dout(0) << "mounting" << dendl;
- sprintf(cmd,"mount %s", g_conf.fakestore_dev);
- system(cmd);
- }
-
- dout(1) << "mkfs in " << basedir << dendl;
-
- // wipe
- sprintf(cmd, "test -d %s && rm -r %s ; mkdir -p %s/collections && mkdir -p %s/objects",
- basedir.c_str(), basedir.c_str(), basedir.c_str(), basedir.c_str());
-
- dout(5) << "wipe: " << cmd << dendl;
- system(cmd);
-
- // fsid
- fsid = rand();
- char fn[100];
- sprintf(fn, "%s/fsid", basedir.c_str());
- int fd = ::open(fn, O_CREAT|O_TRUNC|O_WRONLY, 0644);
- ::write(fd, &fsid, sizeof(fsid));
- ::close(fd);
- dout(10) << "mkfs fsid is " << fsid << dendl;
-
- // journal?
- struct stat st;
- sprintf(fn, "%s.journal", basedir.c_str());
- if (::lstat(fn, &st) == 0) {
- journal = new FileJournal(fsid, &finisher, fn, g_conf.journal_dio);
- if (journal->create() < 0) {
- dout(0) << "mkfs error creating journal on " << fn << dendl;
- } else {
- dout(0) << "mkfs created journal on " << fn << dendl;
- }
- delete journal;
- journal = 0;
- } else {
- dout(10) << "mkfs no journal at " << fn << dendl;
- }
-
- if (g_conf.fakestore_dev) {
- char cmd[100];
- dout(0) << "umounting" << dendl;
- sprintf(cmd,"umount %s", g_conf.fakestore_dev);
- //system(cmd);
- }
-
- dout(1) << "mkfs done in " << basedir << dendl;
-
- return 0;
-}
-
-int FakeStore::mount()
-{
- if (g_conf.fakestore_dev) {
- dout(0) << "mounting" << dendl;
- char cmd[100];
- sprintf(cmd,"mount %s", g_conf.fakestore_dev);
- //system(cmd);
- }
-
- dout(5) << "basedir " << basedir << dendl;
-
- // make sure global base dir exists
- struct stat st;
- int r = ::stat(basedir.c_str(), &st);
- if (r != 0) {
- derr(0) << "unable to stat basedir " << basedir << ", " << strerror(errno) << dendl;
- return -errno;
- }
-
- if (g_conf.fakestore_fake_collections) {
- dout(0) << "faking collections (in memory)" << dendl;
- fake_collections = true;
- }
-
- // fake attrs?
- // let's test to see if they work.
- if (g_conf.fakestore_fake_attrs) {
- dout(0) << "faking attrs (in memory)" << dendl;
- fake_attrs = true;
- } else {
- char names[1000];
- r = do_listxattr(basedir.c_str(), names, 1000);
- if (r < 0) {
- derr(0) << "xattrs don't appear to work (" << strerror(errno) << "), specify --fakestore_fake_attrs to fake them (in memory)." << dendl;
- assert(0);
- }
- }
-
- char fn[100];
- int fd;
-
-#ifdef BTRFS_IOC_SYNC
- // is this btrfs?
- btrfs_fd = ::open(basedir.c_str(), O_DIRECTORY);
- r = ::ioctl(btrfs_fd, BTRFS_IOC_SYNC);
- if (r == 0) {
- dout(0) << "mount detected btrfs" << dendl;
- } else {
- dout(0) << "mount did NOT detect btrfs: " << strerror(-r) << dendl;
- ::close(btrfs_fd);
- btrfs_fd = -1;
- }
-#endif
-
- // get fsid
- sprintf(fn, "%s/fsid", basedir.c_str());
- fd = ::open(fn, O_RDONLY);
- ::read(fd, &fsid, sizeof(fsid));
- ::close(fd);
- dout(10) << "mount fsid is " << fsid << dendl;
-
- // get epoch
- sprintf(fn, "%s/commit_epoch", basedir.c_str());
- fd = ::open(fn, O_RDONLY);
- ::read(fd, &super_epoch, sizeof(super_epoch));
- ::close(fd);
- dout(5) << "mount epoch is " << super_epoch << dendl;
-
- // journal
- sprintf(fn, "%s.journal", basedir.c_str());
- if (::stat(fn, &st) == 0) {
- dout(10) << "mount opening journal at " << fn << dendl;
- journal = new FileJournal(fsid, &finisher, fn, g_conf.journal_dio);
- } else {
- dout(10) << "mount no journal at " << fn << dendl;
- }
- r = journal_replay();
- if (r == -EINVAL) {
- dout(0) << "mount got EINVAL on journal open, not mounting" << dendl;
- return r;
- }
- journal_start();
- sync_thread.create();
-
- // all okay.
- return 0;
-}
-
-int FakeStore::umount()
-{
- dout(5) << "umount " << basedir << dendl;
-
- sync();
- journal_stop();
-
- lock.Lock();
- stop = true;
- sync_cond.Signal();
- lock.Unlock();
- sync_thread.join();
-
- if (btrfs_fd >= 0) {
- ::close(btrfs_fd);
- btrfs_fd = -1;
- }
-
- if (g_conf.fakestore_dev) {
- char cmd[100];
- dout(0) << "umounting" << dendl;
- sprintf(cmd,"umount %s", g_conf.fakestore_dev);
- //system(cmd);
- }
-
- // nothing
- return 0;
-}
-
-
-int FakeStore::transaction_start()
-{
- if (btrfs_fd < 0)
- return 0;
-
- int fd = ::open(basedir.c_str(), O_RDONLY);
- if (fd < 0)
- derr(0) << "transaction_start got " << strerror(errno)
- << " from btrfs open" << dendl;
- if (::ioctl(fd, BTRFS_IOC_TRANS_START) < 0) {
- derr(0) << "transaction_start got " << strerror(errno)
- << " from btrfs ioctl" << dendl;
- ::close(fd);
- return -errno;
- }
- dout(10) << "transaction_start " << fd << dendl;
- return fd;
-}
-
-void FakeStore::transaction_end(int fd)
-{
- if (btrfs_fd < 0)
- return;
- dout(10) << "transaction_end " << fd << dendl;
- ::close(fd);
-}
-
-
-// --------------------
-// objects
-
-bool FakeStore::exists(pobject_t oid)
-{
- struct stat st;
- if (stat(oid, &st) == 0)
- return true;
- else
- return false;
-}
-
-int FakeStore::stat(pobject_t oid, struct stat *st)
-{
- dout(20) << "stat " << oid << dendl;
- char fn[200];
- get_oname(oid,fn);
- int r = ::stat(fn, st);
- dout(20) << "stat " << oid << " at " << fn << " = " << r << dendl;
- return r < 0 ? -errno:r;
-}
-
-
-int FakeStore::remove(pobject_t oid, Context *onsafe)
-{
- dout(20) << "remove " << oid << dendl;
- char fn[200];
- get_oname(oid,fn);
- int r = ::unlink(fn);
- if (r == 0)
- journal_remove(oid, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::truncate(pobject_t oid, off_t size, Context *onsafe)
-{
- dout(20) << "truncate " << oid << " size " << size << dendl;
-
- char fn[200];
- get_oname(oid,fn);
- int r = ::truncate(fn, size);
- if (r >= 0) journal_truncate(oid, size, onsafe);
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::read(pobject_t oid,
- off_t offset, size_t len,
- bufferlist& bl) {
- dout(20) << "read " << oid << " len " << len << " off " << offset << dendl;
-
- char fn[200];
- get_oname(oid,fn);
-
- int fd = ::open(fn, O_RDONLY);
- if (fd < 0) {
- dout(10) << "read couldn't open " << fn << " errno " << errno << " " << strerror(errno) << dendl;
- return -errno;
- }
- ::flock(fd, LOCK_EX); // lock for safety
-
- off_t actual = lseek(fd, offset, SEEK_SET);
- size_t got = 0;
-
- if (len == 0) {
- struct stat st;
- ::fstat(fd, &st);
- len = st.st_size;
- }
-
- if (actual == offset) {
- bufferptr bptr(len); // prealloc space for entire read
- got = ::read(fd, bptr.c_str(), len);
- bptr.set_length(got); // properly size the buffer
- if (got > 0) bl.push_back( bptr ); // put it in the target bufferlist
- }
- ::flock(fd, LOCK_UN);
- ::close(fd);
- return got;
-}
-
-
-int FakeStore::write(pobject_t oid,
- off_t offset, size_t len,
- const bufferlist& bl,
- Context *onsafe)
-{
- char fn[200];
- get_oname(oid,fn);
-
- dout(20) << "write " << fn << " len " << len << " off " << offset << dendl;
-
- int flags = O_WRONLY|O_CREAT;
- int fd = ::open(fn, flags, 0644);
- if (fd < 0) {
- derr(0) << "write couldn't open " << fn << " flags " << flags << " errno " << errno << " " << strerror(errno) << dendl;
- return -errno;
- }
- ::flock(fd, LOCK_EX); // lock for safety
-
- // seek
- off_t actual = ::lseek(fd, offset, SEEK_SET);
- int did = 0;
- assert(actual == offset);
-
- // write buffers
- for (list<bufferptr>::const_iterator it = bl.buffers().begin();
- it != bl.buffers().end();
- it++) {
- int r = ::write(fd, (char*)(*it).c_str(), (*it).length());
- if (r > 0)
- did += r;
- else {
- derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl;
- }
- }
-
- if (did < 0) {
- derr(0) << "couldn't write to " << fn << " len " << len << " off " << offset << " errno " << errno << " " << strerror(errno) << dendl;
- }
-
- ::flock(fd, LOCK_UN);
-
- // schedule sync
- if (did >= 0)
- journal_write(oid, offset, len, bl, onsafe);
- else
- delete onsafe;
-
- ::close(fd);
-
- return did;
-}
-
-int FakeStore::clone(pobject_t oldoid, pobject_t newoid)
-{
- char ofn[200], nfn[200];
- get_oname(oldoid, ofn);
- get_oname(newoid, nfn);
-
- dout(20) << "clone " << ofn << " -> " << nfn << dendl;
-
- int o = ::open(ofn, O_RDONLY);
- if (o < 0)
- return -errno;
- int n = ::open(nfn, O_CREAT|O_TRUNC|O_WRONLY, 0644);
- if (n < 0)
- return -errno;
- int r = 0;
- if (btrfs_fd >= 0)
- r = ::ioctl(n, BTRFS_IOC_CLONE, o);
- else {
- struct stat st;
- ::fstat(o, &st);
-
-#ifdef SPLICE_F_MOVE
- loff_t op = 0, np = 0;
- while (op < st.st_size && r >= 0)
- r = ::splice(o, &op, n, &np, st.st_size-op, 0);
-#else
- loff_t pos = 0;
- int buflen = 4096*10;
- char buf[buflen];
- while (pos < st.st_size) {
- int l = MIN(st.st_size-pos, buflen);
- r = ::read(o, buf, l);
- if (r < 0)
- break;
- int op = 0;
- while (op < l) {
- int r2 = ::write(n, buf+op, l-op);
-
- if (r2 < 0) { r = r2; break; }
- op += r2;
- }
- if (r < 0) break;
- pos += r;
- }
-#endif
- }
- if (r < 0)
- return -errno;
-
- ::close(n);
- ::close(o);
- return 0;
-}
-
-
-void FakeStore::sync_entry()
-{
- lock.Lock();
- utime_t interval;
- interval.set_from_double(g_conf.fakestore_sync_interval);
- while (!stop) {
- dout(20) << "sync_entry waiting for " << interval << dendl;
- sync_cond.WaitInterval(lock, interval);
- lock.Unlock();
-
- dout(20) << "sync_entry committing " << super_epoch << " " << interval << dendl;
- commit_start();
-
- // induce an fs sync.
- // we assume data=ordered or similar semantics
- char fn[100];
- sprintf(fn, "%s/commit_epoch", basedir.c_str());
- int fd = ::open(fn, O_CREAT|O_WRONLY, 0644);
- ::write(fd, &super_epoch, sizeof(super_epoch));
- ::fsync(fd); // this should cause the fs's journal to commit. (on btrfs too.)
- ::close(fd);
-
- commit_finish();
-
- lock.Lock();
- dout(20) << "sync_entry committed " << super_epoch << dendl;
- }
- lock.Unlock();
-}
-
-void FakeStore::sync()
-{
- Mutex::Locker l(lock);
- sync_cond.Signal();
-}
-
-void FakeStore::sync(Context *onsafe)
-{
- journal_sync(onsafe);
- sync();
-}
-
-
-// -------------------------------
-// attributes
-
-// objects
-
-int FakeStore::setattr(pobject_t oid, const char *name,
- const void *value, size_t size,
- Context *onsafe)
-{
- int r;
- if (fake_attrs)
- r = attrs.setattr(oid, name, value, size, onsafe);
- else {
- char fn[100];
- get_oname(oid, fn);
- r = do_setxattr(fn, name, value, size);
- }
- if (r >= 0)
- journal_setattr(oid, name, value, size, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::setattrs(pobject_t oid, map<string,bufferptr>& aset)
-{
- int r;
- if (fake_attrs)
- r = attrs.setattrs(oid, aset);
- else {
- char fn[100];
- get_oname(oid, fn);
- r = 0;
- for (map<string,bufferptr>::iterator p = aset.begin();
- p != aset.end();
- ++p) {
- r = do_setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length());
- if (r < 0) {
- cerr << "error setxattr " << strerror(errno) << std::endl;
- break;
- }
- }
- }
- if (r >= 0)
- journal_setattrs(oid, aset, 0);
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::getattr(pobject_t oid, const char *name,
- void *value, size_t size)
-{
- int r;
- if (fake_attrs)
- r = attrs.getattr(oid, name, value, size);
- else {
- char fn[100];
- get_oname(oid, fn);
- r = do_getxattr(fn, name, value, size);
- }
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::getattrs(pobject_t oid, map<string,bufferptr>& aset)
-{
- int r;
- if (fake_attrs)
- r = attrs.getattrs(oid, aset);
- else {
- char fn[100];
- get_oname(oid, fn);
-
- char val[1000];
- char names[1000];
- int num = do_listxattr(fn, names, 1000);
-
- char *name = names;
- for (int i=0; i<num; i++) {
- dout(0) << "getattrs " << oid << " getting " << (i+1) << "/" << num << " '" << names << "'" << dendl;
- int l = do_getxattr(fn, name, val, 1000);
- dout(0) << "getattrs " << oid << " getting " << (i+1) << "/" << num << " '" << names << "' = " << l << " bytes" << dendl;
- aset[names].append(val, l);
- name += strlen(name) + 1;
- }
- }
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::rmattr(pobject_t oid, const char *name, Context *onsafe)
-{
- int r;
- if (fake_attrs)
- r = attrs.rmattr(oid, name, onsafe);
- else {
- char fn[100];
- get_oname(oid, fn);
- r = do_removexattr(fn, name);
- }
- if (r >= 0)
- journal_rmattr(oid, name, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-
-
-// collections
-
-int FakeStore::collection_setattr(coll_t c, const char *name,
- void *value, size_t size,
- Context *onsafe)
-{
- int r;
- if (fake_attrs)
- r = attrs.collection_setattr(c, name, value, size, onsafe);
- else {
- char fn[200];
- get_cdir(c, fn);
- r = do_setxattr(fn, name, value, size);
- }
- if (r >= 0)
- journal_collection_setattr(c, name, value, size, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_rmattr(coll_t c, const char *name,
- Context *onsafe)
-{
- int r;
- if (fake_attrs)
- r = attrs.collection_rmattr(c, name, onsafe);
- else {
- char fn[200];
- get_cdir(c, fn);
- r = do_removexattr(fn, name);
- }
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_getattr(coll_t c, const char *name,
- void *value, size_t size)
-{
- int r;
- if (fake_attrs)
- r = attrs.collection_getattr(c, name, value, size);
- else {
- char fn[200];
- get_cdir(c, fn);
- r = do_getxattr(fn, name, value, size);
- }
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_setattrs(coll_t cid, map<string,bufferptr>& aset)
-{
- int r;
- if (fake_attrs)
- r = attrs.collection_setattrs(cid, aset);
- else {
- char fn[100];
- get_cdir(cid, fn);
- int r = 0;
- for (map<string,bufferptr>::iterator p = aset.begin();
- p != aset.end();
- ++p) {
- r = do_setxattr(fn, p->first.c_str(), p->second.c_str(), p->second.length());
- if (r < 0) break;
- }
- }
- if (r >= 0)
- journal_collection_setattrs(cid, aset, 0);
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_getattrs(coll_t cid, map<string,bufferptr>& aset)
-{
- int r;
- if (fake_attrs)
- r = attrs.collection_getattrs(cid, aset);
- else {
- char fn[100];
- get_cdir(cid, fn);
-
- char val[1000];
- char names[1000];
- int num = do_listxattr(fn, names, 1000);
-
- char *name = names;
- for (int i=0; i<num; i++) {
- dout(0) << "getattrs " << cid << " getting " << (i+1) << "/" << num << " '" << names << "'" << dendl;
- int l = do_getxattr(fn, name, val, 1000);
- dout(0) << "getattrs " << cid << " getting " << (i+1) << "/" << num << " '" << names << "' = " << l << " bytes" << dendl;
- aset[names].append(val, l);
- name += strlen(name) + 1;
- }
- r = 0;
- }
- return r < 0 ? -errno:r;
-}
-
-
-/*
-int FakeStore::collection_listattr(coll_t c, char *attrs, size_t size)
-{
- if (fake_attrs) return collection_listattr(c, attrs, size);
- return 0;
-}
-*/
-
-
-int FakeStore::list_objects(list<pobject_t>& ls)
-{
- char fn[200];
- sprintf(fn, "%s/objects", basedir.c_str());
-
- DIR *dir = ::opendir(fn);
- assert(dir);
-
- struct dirent *de;
- while ((de = ::readdir(dir)) != 0) {
- if (de->d_name[0] == '.') continue;
- // parse
- pobject_t o = parse_object(de->d_name);
- if (errno) continue;
- ls.push_back(o);
- }
-
- ::closedir(dir);
- return 0;
-}
-
-
-// --------------------------
-// collections
-
-int FakeStore::list_collections(list<coll_t>& ls)
-{
- if (fake_collections) return collections.list_collections(ls);
-
- char fn[200];
- sprintf(fn, "%s/collections", basedir.c_str());
-
- DIR *dir = ::opendir(fn);
- assert(dir);
-
- struct dirent *de;
- while ((de = ::readdir(dir)) != 0) {
- // parse
- errno = 0;
- coll_t c = parse_coll(de->d_name);
- if (c) ls.push_back(c);
- }
-
- ::closedir(dir);
- return 0;
-}
-
-int FakeStore::create_collection(coll_t c,
- Context *onsafe)
-{
- if (fake_collections) return collections.create_collection(c, onsafe);
-
- char fn[200];
- get_cdir(c, fn);
-
- int r = ::mkdir(fn, 0755);
-
- if (r >= 0)
- journal_create_collection(c, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::destroy_collection(coll_t c,
- Context *onsafe)
-{
- if (fake_collections) return collections.destroy_collection(c, onsafe);
-
- char fn[200];
- get_cdir(c, fn);
- char cmd[200];
- sprintf(cmd, "test -d %s && rm -r %s", fn, fn);
- system(cmd);
- int r = 0; // fixme
-
- if (r >= 0)
- journal_destroy_collection(c, onsafe);
- else
- delete onsafe;
- return 0;
-}
-
-int FakeStore::collection_stat(coll_t c, struct stat *st)
-{
- if (fake_collections) return collections.collection_stat(c, st);
-
- char fn[200];
- get_cdir(c, fn);
- int r = ::lstat(fn, st);
- return r < 0 ? -errno:r;
-}
-
-bool FakeStore::collection_exists(coll_t c)
-{
- if (fake_collections) return collections.collection_exists(c);
-
- struct stat st;
- return collection_stat(c, &st) == 0;
-}
-
-
-int FakeStore::collection_add(coll_t c, pobject_t o,
- Context *onsafe)
-{
- int r;
- if (fake_collections)
- r = collections.collection_add(c, o, onsafe);
- else {
- char cof[200];
- get_coname(c, o, cof);
- char of[200];
- get_oname(o, of);
- r = ::link(of, cof);
- }
- if (r >= 0)
- journal_collection_add(c, o, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_remove(coll_t c, pobject_t o,
- Context *onsafe)
-{
- int r;
- if (fake_collections)
- r = collections.collection_remove(c, o, onsafe);
- else {
- char cof[200];
- get_coname(c, o, cof);
- r = ::unlink(cof);
- }
- if (r >= 0)
- journal_collection_remove(c, o, onsafe);
- else
- delete onsafe;
- return r < 0 ? -errno:r;
-}
-
-int FakeStore::collection_list(coll_t c, list<pobject_t>& ls)
-{
- if (fake_collections) return collections.collection_list(c, ls);
-
- char fn[200];
- get_cdir(c, fn);
-
- DIR *dir = ::opendir(fn);
- assert(dir);
-
- struct dirent *de;
- while ((de = ::readdir(dir)) != 0) {
- // parse
- if (de->d_name[0] == '.') continue;
- //cout << " got object " << de->d_name << std::endl;
- pobject_t o = parse_object(de->d_name);
- if (errno) continue;
- ls.push_back(o);
- }
-
- ::closedir(dir);
- return 0;
-}
-
-// eof.
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __FAKESTORE_H
-#define __FAKESTORE_H
-
-#include "ObjectStore.h"
-#include "JournalingObjectStore.h"
-#include "common/ThreadPool.h"
-#include "common/Mutex.h"
-
-#include "Fake.h"
-//#include "FakeStoreBDBCollections.h"
-
-
-#include <map>
-using namespace std;
-
-#include <ext/hash_map>
-using namespace __gnu_cxx;
-
-
-// fake attributes in memory, if we need to.
-
-class FakeStore : public JournalingObjectStore {
- string basedir;
- __u64 fsid;
-
- int btrfs_fd; // >= if btrfs
-
- // fake attrs?
- FakeStoreAttrs attrs;
- bool fake_attrs;
-
- // fake collections?
- FakeStoreCollections collections;
- bool fake_collections;
-
- // helper fns
- void get_oname(pobject_t oid, char *s);
- void get_cdir(coll_t cid, char *s);
- void get_coname(coll_t cid, pobject_t oid, char *s);
- pobject_t parse_object(char *s);
- coll_t parse_coll(char *s);
-
- // sync thread
- Mutex lock;
- Cond sync_cond;
- bool stop;
- void sync_entry();
- struct SyncThread : public Thread {
- FakeStore *fs;
- SyncThread(FakeStore *f) : fs(f) {}
- void *entry() {
- fs->sync_entry();
- return 0;
- }
- } sync_thread;
-
- void sync_fs(); // actuall sync underlying fs
-
- public:
- FakeStore(const char *base) :
- basedir(base),
- btrfs_fd(-1),
- attrs(this), fake_attrs(false),
- collections(this), fake_collections(false),
- stop(false), sync_thread(this) { }
-
- int mount();
- int umount();
- int mkfs();
-
- int transaction_start();
- void transaction_end(int id);
-
- int statfs(struct statfs *buf);
-
- // ------------------
- // objects
- int pick_object_revision_lt(pobject_t& oid) {
- return 0;
- }
- bool exists(pobject_t oid);
- int stat(pobject_t oid, struct stat *st);
- int remove(pobject_t oid, Context *onsafe);
- int truncate(pobject_t oid, off_t size, Context *onsafe);
- int read(pobject_t oid, off_t offset, size_t len, bufferlist& bl);
- int write(pobject_t oid, off_t offset, size_t len, const bufferlist& bl, Context *onsafe);
- int clone(pobject_t oldoid, pobject_t newoid);
-
- void sync();
- void sync(Context *onsafe);
-
- int list_objects(list<pobject_t>& ls);
-
- // attrs
- int setattr(pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe=0);
- int setattrs(pobject_t oid, map<string,bufferptr>& aset);
- int getattr(pobject_t oid, const char *name, void *value, size_t size);
- int getattrs(pobject_t oid, map<string,bufferptr>& aset);
- int rmattr(pobject_t oid, const char *name, Context *onsafe=0);
- //int listattr(pobject_t oid, char *attrs, size_t size);
- int collection_setattr(coll_t c, const char *name, void *value, size_t size, Context *onsafe=0);
- int collection_rmattr(coll_t c, const char *name, Context *onsafe=0);
- int collection_getattr(coll_t c, const char *name, void *value, size_t size);
- //int collection_listattr(coll_t c, char *attrs, size_t size);
- int collection_getattrs(coll_t cid, map<string,bufferptr> &aset);
- int collection_setattrs(coll_t cid, map<string,bufferptr> &aset);
-
- // collections
- int list_collections(list<coll_t>& ls);
- int create_collection(coll_t c, Context *onsafe=0);
- int destroy_collection(coll_t c, Context *onsafe=0);
- int collection_stat(coll_t c, struct stat *st);
- bool collection_exists(coll_t c);
- int collection_add(coll_t c, pobject_t o, Context *onsafe=0);
- int collection_remove(coll_t c, pobject_t o, Context *onsafe=0);
- int collection_list(coll_t c, list<pobject_t>& o);
-
-
-
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __FAKESTOREBDBCOLLECTIONS_H
-#define __FAKESTOREBDBCOLLECTIONS_H
-
-#include "BDBMap.h"
-#include "ObjectStore.h"
-#include "common/Mutex.h"
-
-#define BDBHASH_DIRS 128LL
-#define BDBHASH_FUNC(x) (((x) ^ ((x)>>30) ^ ((x)>>18) ^ ((x)>>45) ^ 0xdead1234) * 884811 % BDBHASH_DIRS)
-
-class FakeStoreBDBCollections {
- private:
- int whoami;
- string basedir;
-
- Mutex bdblock;
-
- // collection dbs
- BDBMap<coll_t, int> collections;
- map<coll_t, BDBMap<object_t, int>*> collection_map;
-
- // dirs
- void get_dir(string& dir) {
- char s[30];
- sprintf(s, "%d", whoami);
- dir = basedir + "/" + s;
- }
- void get_collfn(coll_t c, string &fn) {
- char s[100];
- sprintf(s, "%d/%02llx/%016llx.co", whoami, BDBHASH_FUNC(c), c);
- fn = basedir + "/" + s;
- }
-
- void open_collections() {
- string cfn;
- get_dir(cfn);
- cfn += "/collections";
- collections.open(cfn.c_str());
- list<coll_t> ls;
- collections.list_keys(ls);
- }
- void close_collections() {
- if (collections.is_open())
- collections.close();
-
- for (map<coll_t, BDBMap<object_t, int>*>::iterator it = collection_map.begin();
- it != collection_map.end();
- it++) {
- it->second->close();
- }
- collection_map.clear();
- }
-
- int open_collection(coll_t c) {
- if (collection_map.count(c))
- return 0; // already open.
-
- string fn;
- get_collfn(c,fn);
- collection_map[c] = new BDBMap<coll_t,int>;
- int r = collection_map[c]->open(fn.c_str());
- if (r != 0)
- collection_map.erase(c); // failed
- return r;
- }
-
- public:
- FakeStoreBDBCollections(int w, string& bd) : whoami(w), basedir(bd) {}
- ~FakeStoreBDBCollections() {
- close_collections();
- }
-
- int list_collections(list<coll_t>& ls) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- ls.clear();
- collections.list_keys(ls);
- bdblock.Unlock();
- return 0;
- }
- int create_collection(coll_t c) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- collections.put(c, 1);
- open_collection(c);
- bdblock.Unlock();
- return 0;
- }
- int destroy_collection(coll_t c) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- collections.del(c);
-
- open_collection(c);
- collection_map[c]->close();
-
- string fn;
- get_collfn(c,fn);
- collection_map[c]->remove(fn.c_str());
- delete collection_map[c];
- collection_map.erase(c);
- bdblock.Unlock();
- return 0;
- }
- int collection_stat(coll_t c, struct stat *st) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- string fn;
- get_collfn(c,fn);
- int r = ::stat(fn.c_str(), st);
- bdblock.Unlock();
- return r;
- }
- bool collection_exists(coll_t c) {
- bdblock.Lock();
- struct stat st;
- int r = collection_stat(c, &st) == 0;
- bdblock.Unlock();
- return r;
- }
- int collection_add(coll_t c, object_t o) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- open_collection(c);
- collection_map[c]->put(o,1);
- bdblock.Unlock();
- return 0;
- }
- int collection_remove(coll_t c, object_t o) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- open_collection(c);
- collection_map[c]->del(o);
- bdblock.Unlock();
- return 0;
- }
- int collection_list(coll_t c, list<object_t>& o) {
- bdblock.Lock();
- if (!collections.is_open()) open_collections();
-
- open_collection(c);
- collection_map[c]->list_keys(o);
- bdblock.Unlock();
- return 0;
- }
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#include "FileJournal.h"
-
-#include <stdio.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <fcntl.h>
-
-
-#include "config.h"
-
-#define dout(x) if (x <= g_conf.debug_journal) *_dout << dbeginl << g_clock.now() << " journal "
-#define derr(x) if (x <= g_conf.debug_journal) *_derr << dbeginl << g_clock.now() << " journal "
-
-
-int FileJournal::_open(bool forwrite)
-{
- int flags;
-
- if (forwrite) {
- flags = O_RDWR;
- if (directio) flags |= O_DIRECT;
- } else {
- flags = O_RDONLY;
- }
-
- if (fd >= 0)
- ::close(fd);
- fd = ::open(fn.c_str(), flags);
- if (fd < 0) {
- dout(2) << "_open failed " << errno << " " << strerror(errno) << dendl;
- return -errno;
- }
-
- // get size
- struct stat st;
- int r = ::fstat(fd, &st);
- assert(r == 0);
- max_size = st.st_size;
- block_size = st.st_blksize;
- dout(2) << "_open " << fn << " fd " << fd
- << ": " << st.st_size << " bytes, block size " << block_size << dendl;
-
- return 0;
-}
-
-int FileJournal::create()
-{
- dout(2) << "create " << fn << dendl;
-
- int err = _open(true);
- if (err < 0) return err;
-
- // write empty header
- memset(&header, 0, sizeof(header));
- header.clear();
- header.fsid = fsid;
- header.max_size = max_size;
- header.block_size = block_size;
- if (directio)
- header.alignment = block_size;
- else
- header.alignment = 16; // at least stay word aligned on 64bit machines...
- print_header();
-
- buffer::ptr bp = prepare_header();
- int r = ::pwrite(fd, bp.c_str(), bp.length(), 0);
- if (r < 0) {
- dout(0) << "create write header error " << errno << " " << strerror(errno) << dendl;
- return -errno;
- }
-
- ::close(fd);
- fd = -1;
- dout(2) << "create done" << dendl;
- return 0;
-}
-
-int FileJournal::open(epoch_t epoch)
-{
- dout(2) << "open " << fn << dendl;
-
- int err = _open(false);
- if (err < 0) return err;
-
- // assume writeable, unless...
- read_pos = 0;
- write_pos = get_top();
-
- // read header?
- read_header();
- dout(10) << "open journal header.fsid = " << header.fsid
- //<< " vs expected fsid = " << fsid
- << dendl;
- if (header.fsid != fsid) {
- dout(2) << "open journal fsid doesn't match, invalid (someone else's?) journal" << dendl;
- err = -EINVAL;
- }
- if (header.max_size > max_size) {
- dout(2) << "open journal size " << header.max_size << " > current " << max_size << dendl;
- err = -EINVAL;
- }
- if (header.block_size != block_size) {
- dout(2) << "open journal block size " << header.block_size << " != current " << block_size << dendl;
- err = -EINVAL;
- }
- if (header.alignment != block_size && directio) {
- derr(0) << "open journal alignment " << header.alignment << " does not match block size "
- << block_size << " (required for direct_io journal mode)" << dendl;
- err = -EINVAL;
- }
- if (err)
- return err;
-
- // looks like a valid header.
- write_pos = 0; // not writeable yet
- read_pos = 0;
-
- if (header.num > 0) {
- // pick an offset
- for (int i=0; i<header.num; i++) {
- if (header.epoch[i] == epoch) {
- dout(2) << "using read_pos header pointer "
- << header.epoch[i] << " at " << header.offset[i]
- << dendl;
- read_pos = header.offset[i];
- write_pos = 0;
- break;
- }
- else if (header.epoch[i] < epoch) {
- dout(2) << "super_epoch is " << epoch
- << ", skipping old " << header.epoch[i] << " at " << header.offset[i]
- << dendl;
- }
- else if (header.epoch[i] > epoch) {
- dout(2) << "super_epoch is " << epoch
- << ", but wtf, journal is later " << header.epoch[i] << " at " << header.offset[i]
- << dendl;
- break;
- }
- }
-
- if (read_pos == 0) {
- dout(0) << "no valid journal segments" << dendl;
- return 0; //hrm return -EINVAL;
- }
-
- } else {
- dout(0) << "journal was empty" << dendl;
- read_pos = -1;
- }
-
- return 0;
-}
-
-void FileJournal::close()
-{
- dout(1) << "close " << fn << dendl;
-
- // stop writer thread
- stop_writer();
-
- // close
- assert(writeq.empty());
- assert(commitq.empty());
- assert(fd > 0);
- ::close(fd);
- fd = -1;
-}
-
-void FileJournal::start_writer()
-{
- write_stop = false;
- write_thread.create();
-}
-
-void FileJournal::stop_writer()
-{
- write_lock.Lock();
- {
- write_stop = true;
- write_cond.Signal();
- }
- write_lock.Unlock();
- write_thread.join();
-}
-
-
-void FileJournal::print_header()
-{
- for (int i=0; i<header.num; i++) {
- if (i && header.offset[i] < header.offset[i-1]) {
- assert(header.wrap);
- dout(10) << "header: wrap at " << header.wrap << dendl;
- }
- dout(10) << "header: epoch " << header.epoch[i] << " at " << header.offset[i] << dendl;
- }
- //if (header.wrap) dout(10) << "header: wrap at " << header.wrap << dendl;
-}
-
-void FileJournal::read_header()
-{
- int r;
- dout(10) << "read_header" << dendl;
- if (directio) {
- buffer::ptr bp = buffer::create_page_aligned(block_size);
- bp.zero();
- r = ::pread(fd, bp.c_str(), bp.length(), 0);
- memcpy(&header, bp.c_str(), sizeof(header));
- } else {
- memset(&header, 0, sizeof(header)); // zero out (read may fail)
- r = ::pread(fd, &header, sizeof(header), 0);
- }
- if (r < 0)
- dout(0) << "read_header error " << errno << " " << strerror(errno) << dendl;
- print_header();
-}
-
-bufferptr FileJournal::prepare_header()
-{
- bufferptr bp;
- if (directio) {
- bp = buffer::create_page_aligned(block_size);
- bp.zero();
- memcpy(bp.c_str(), &header, sizeof(header));
- } else {
- bp = buffer::create(sizeof(header));
- memcpy(bp.c_str(), &header, sizeof(header));
- }
- return bp;
-}
-
-
-
-
-void FileJournal::check_for_wrap(epoch_t epoch, off64_t pos, off64_t size)
-{
- // epoch boundary?
- dout(10) << "check_for_wrap epoch " << epoch << " last " << header.last_epoch() << " of " << header.num << dendl;
- if (epoch > header.last_epoch()) {
- dout(10) << "saw an epoch boundary " << header.last_epoch() << " -> " << epoch << dendl;
- header.push(epoch, pos);
- must_write_header = true;
- }
-
- // does it fit?
- if (header.wrap) {
- // we're wrapped. don't overwrite ourselves.
- if (pos + size >= header.offset[0]) {
- dout(10) << "JOURNAL FULL (and wrapped), " << pos << "+" << size
- << " >= " << header.offset[0]
- << dendl;
- full = true;
- writeq.clear();
- print_header();
- }
- } else {
- // we haven't wrapped.
- if (pos + size >= header.max_size) {
- // is there room if we wrap?
- if (get_top() + size < header.offset[0]) {
- // yes!
- dout(10) << "wrapped from " << pos << " to " << get_top() << dendl;
- header.wrap = pos;
- pos = get_top();
- header.push(epoch, pos);
- must_write_header = true;
- } else {
- // no room.
- dout(10) << "submit_entry JOURNAL FULL (and can't wrap), " << pos << "+" << size
- << " >= " << header.max_size
- << dendl;
- full = true;
- writeq.clear();
- }
- }
- }
-}
-
-
-void FileJournal::prepare_multi_write(bufferlist& bl)
-{
- // gather queued writes
- off64_t queue_pos = write_pos;
-
- int eleft = g_conf.journal_max_write_entries;
- int bleft = g_conf.journal_max_write_bytes;
-
- while (!writeq.empty()) {
- // grab next item
- epoch_t epoch = writeq.front().first;
- bufferlist &ebl = writeq.front().second;
- off64_t size = 2*sizeof(entry_header_t) + ebl.length();
-
- if (bl.length() > 0 && bleft > 0 && bleft < size) break;
-
- check_for_wrap(epoch, queue_pos, size);
- if (full) break;
- if (bl.length() && must_write_header)
- break;
-
- // add to write buffer
- dout(15) << "prepare_multi_write will write " << queue_pos << " : "
- << ebl.length() << " epoch " << epoch << " -> " << size << dendl;
-
- // add it this entry
- entry_header_t h;
- h.epoch = epoch;
- h.len = ebl.length();
- h.make_magic(queue_pos, header.fsid);
- bl.append((const char*)&h, sizeof(h));
- bl.claim_append(ebl);
- bl.append((const char*)&h, sizeof(h));
-
- Context *oncommit = commitq.front();
- if (oncommit)
- writingq.push_back(oncommit);
-
- // pop from writeq
- writeq.pop_front();
- commitq.pop_front();
-
- queue_pos += size;
- if (--eleft == 0) break;
- bleft -= size;
- if (bleft == 0) break;
- }
-}
-
-bool FileJournal::prepare_single_dio_write(bufferlist& bl)
-{
- // grab next item
- epoch_t epoch = writeq.front().first;
- bufferlist &ebl = writeq.front().second;
-
- off64_t size = 2*sizeof(entry_header_t) + ebl.length();
- size = ROUND_UP_TO(size, header.alignment);
-
- check_for_wrap(epoch, write_pos, size);
- if (full) return false;
-
- // build it
- dout(15) << "prepare_single_dio_write will write " << write_pos << " : "
- << ebl.length() << " epoch " << epoch << " -> " << size << dendl;
-
- bufferptr bp = buffer::create_page_aligned(size);
- entry_header_t *h = (entry_header_t*)bp.c_str();
- h->epoch = epoch;
- h->len = ebl.length();
- h->make_magic(write_pos, header.fsid);
- ebl.copy(0, ebl.length(), bp.c_str()+sizeof(*h));
- memcpy(bp.c_str() + sizeof(*h) + ebl.length(), h, sizeof(*h));
- bl.push_back(bp);
-
- Context *oncommit = commitq.front();
- if (oncommit)
- writingq.push_back(oncommit);
-
- // pop from writeq
- writeq.pop_front();
- commitq.pop_front();
- return true;
-}
-
-void FileJournal::do_write(bufferlist& bl)
-{
- // nothing to do?
- if (bl.length() == 0 && !must_write_header)
- return;
-
- buffer::ptr hbp;
- if (must_write_header)
- hbp = prepare_header();
-
- writing = true;
-
- header_t old_header = header;
-
- write_lock.Unlock();
-
- dout(15) << "do_write writing " << write_pos << "~" << bl.length()
- << (must_write_header ? " + header":"")
- << dendl;
-
- // header
- if (hbp.length())
- ::pwrite(fd, hbp.c_str(), hbp.length(), 0);
-
- // entry
-#ifdef DARWIN
- off_t pos = write_pos;
- ::lseek(fd, write_pos, SEEK_SET);
-#else
- off64_t pos = write_pos;
- ::lseek64(fd, write_pos, SEEK_SET);
-#endif
- for (list<bufferptr>::const_iterator it = bl.buffers().begin();
- it != bl.buffers().end();
- it++) {
- if ((*it).length() == 0) continue; // blank buffer.
- int r = ::write(fd, (char*)(*it).c_str(), (*it).length());
- if (r < 0)
- derr(0) << "do_write failed with " << errno << " " << strerror(errno)
- << " with " << (void*)(*it).c_str() << " len " << (*it).length()
- << dendl;
- pos += (*it).length();
- }
-#ifdef DARWIN
- if (!directio)
- ::fsync(fd);
-#else
- if (!directio)
- ::fdatasync(fd);
-#endif
-
-
- write_lock.Lock();
-
- writing = false;
- if (memcmp(&old_header, &header, sizeof(header)) == 0) {
- write_pos += bl.length();
- write_pos = ROUND_UP_TO(write_pos, header.alignment);
- finisher->queue(writingq);
- } else {
- dout(10) << "do_write finished write but header changed? not moving write_pos." << dendl;
- derr(0) << "do_write finished write but header changed? not moving write_pos." << dendl;
- assert(writingq.empty());
- }
-}
-
-
-void FileJournal::write_thread_entry()
-{
- dout(10) << "write_thread_entry start" << dendl;
- write_lock.Lock();
-
- while (!write_stop) {
- if (writeq.empty()) {
- // sleep
- dout(20) << "write_thread_entry going to sleep" << dendl;
- write_cond.Wait(write_lock);
- dout(20) << "write_thread_entry woke up" << dendl;
- continue;
- }
-
- bufferlist bl;
- must_write_header = false;
- if (directio)
- prepare_single_dio_write(bl);
- else
- prepare_multi_write(bl);
- do_write(bl);
- }
-
- write_lock.Unlock();
- dout(10) << "write_thread_entry finish" << dendl;
-}
-
-
-bool FileJournal::is_full()
-{
- Mutex::Locker locker(write_lock);
- return full;
-}
-
-void FileJournal::submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit)
-{
- Mutex::Locker locker(write_lock); // ** lock **
-
- // dump on queue
- dout(10) << "submit_entry " << e.length()
- << " epoch " << epoch
- << " " << oncommit << dendl;
- commitq.push_back(oncommit);
- if (!full) {
- writeq.push_back(pair<epoch_t,bufferlist>(epoch, e));
- write_cond.Signal(); // kick writer thread
- }
-}
-
-
-void FileJournal::commit_epoch_start(epoch_t new_epoch)
-{
- dout(10) << "commit_epoch_start on " << new_epoch-1
- << " -- new epoch " << new_epoch
- << dendl;
-
- Mutex::Locker locker(write_lock);
-
- // was full -> empty -> now usable?
- if (full) {
- if (header.num != 0) {
- dout(1) << " journal FULL, ignoring this epoch" << dendl;
- return;
- }
-
- dout(1) << " clearing FULL flag, journal now usable" << dendl;
- full = false;
- }
-}
-
-void FileJournal::commit_epoch_finish(epoch_t new_epoch)
-{
- dout(10) << "commit_epoch_finish committed " << (new_epoch-1) << dendl;
-
- Mutex::Locker locker(write_lock);
-
- if (full) {
- // full journal damage control.
- dout(15) << " journal was FULL, contents now committed, clearing header. journal still not usable until next epoch." << dendl;
- header.clear();
- write_pos = get_top();
- } else {
- // update header -- trim/discard old (committed) epochs
- print_header();
- while (header.num && header.epoch[0] < new_epoch) {
- dout(10) << " popping epoch " << header.epoch[0] << " < " << new_epoch << dendl;
- header.pop();
- }
- if (header.num == 0) {
- dout(10) << " starting fresh" << dendl;
- write_pos = get_top();
- header.push(new_epoch, write_pos);
- }
- }
- must_write_header = true;
-
- // discard any unwritten items in previous epoch
- while (!writeq.empty() && writeq.front().first < new_epoch) {
- dout(15) << " dropping unwritten and committed "
- << write_pos << " : " << writeq.front().second.length()
- << " epoch " << writeq.front().first
- << dendl;
- // finisher?
- Context *oncommit = commitq.front();
- if (oncommit) writingq.push_back(oncommit);
-
- // discard.
- writeq.pop_front();
- commitq.pop_front();
- }
-
- // queue the finishers
- finisher->queue(writingq);
- dout(10) << "commit_epoch_finish done" << dendl;
-}
-
-
-void FileJournal::make_writeable()
-{
- _open(true);
-
- if (read_pos > 0)
- write_pos = read_pos;
- else
- write_pos = get_top();
- read_pos = 0;
-
- must_write_header = true;
- start_writer();
-}
-
-
-bool FileJournal::read_entry(bufferlist& bl, epoch_t& epoch)
-{
- if (!read_pos) {
- dout(2) << "read_entry -- not readable" << dendl;
- return false;
- }
-
- if (read_pos == header.wrap) {
- // find wrap point
- for (int i=1; i<header.num; i++) {
- if (header.offset[i] < read_pos) {
- assert(header.offset[i-1] < read_pos);
- read_pos = header.offset[i];
- break;
- }
- }
- assert(read_pos != header.wrap);
- dout(10) << "read_entry wrapped from " << header.wrap << " to " << read_pos << dendl;
- }
-
- // header
- entry_header_t h;
-#ifdef DARWIN
- ::lseek(fd, read_pos, SEEK_SET);
-#else
- ::lseek64(fd, read_pos, SEEK_SET);
-#endif
- ::read(fd, &h, sizeof(h));
- if (!h.check_magic(read_pos, header.fsid)) {
- dout(2) << "read_entry " << read_pos << " : bad header magic, end of journal" << dendl;
- return false;
- }
-
- // body
- bufferptr bp(h.len);
- ::read(fd, bp.c_str(), h.len);
-
- // footer
- entry_header_t f;
- ::read(fd, &f, sizeof(h));
- if (!f.check_magic(read_pos, header.fsid) ||
- h.epoch != f.epoch ||
- h.len != f.len) {
- dout(2) << "read_entry " << read_pos << " : bad footer magic, partial entry, end of journal" << dendl;
- return false;
- }
-
-
- // yay!
- dout(1) << "read_entry " << read_pos << " : "
- << " " << h.len << " bytes"
- << " epoch " << h.epoch
- << dendl;
-
- bl.push_back(bp);
- epoch = h.epoch;
-
- read_pos += 2*sizeof(entry_header_t) + h.len;
- read_pos = ROUND_UP_TO(read_pos, header.alignment);
-
- return true;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __EBOFS_FILEJOURNAL_H
-#define __EBOFS_FILEJOURNAL_H
-
-
-#include "Journal.h"
-#include "common/Cond.h"
-#include "common/Mutex.h"
-#include "common/Thread.h"
-
-class FileJournal : public Journal {
-public:
- /** log header
- * we allow 4 pointers:
- * top/initial,
- * one for an epoch boundary (if any),
- * one for a wrap in the ring buffer/journal file,
- * one for a second epoch boundary (if any).
- * the epoch boundary one is useful only for speedier recovery in certain cases
- * (i.e. when ebofs committed, but the journal didn't rollover ... very small window!)
- */
- struct header_t {
- __u64 fsid;
- __s64 num;
- __u32 block_size;
- __u32 alignment;
- __s64 max_size;
- __s64 wrap;
- __u32 epoch[4];
- __s64 offset[4];
-
- header_t() : fsid(0), num(0), block_size(0), alignment(0), max_size(0), wrap(0) {}
-
- void clear() {
- num = 0;
- wrap = 0;
- }
- void pop() {
- if (num >= 2 && offset[0] > offset[1])
- wrap = 0; // we're eliminating a wrap
- num--;
- for (int i=0; i<num; i++) {
- epoch[i] = epoch[i+1];
- offset[i] = offset[i+1];
- }
- }
- void push(epoch_t e, off64_t o) {
- assert(num < 4);
- if (num > 2 &&
- epoch[num-1] == e &&
- epoch[num-2] == (e-1))
- num--; // tail was an epoch boundary; replace it.
- epoch[num] = e;
- offset[num] = o;
- num++;
- }
- epoch_t last_epoch() {
- if (num)
- return epoch[num-1];
- else
- return 0;
- }
- } header;
-
- struct entry_header_t {
- uint64_t epoch;
- uint64_t len;
- uint64_t magic1;
- uint64_t magic2;
-
- void make_magic(off64_t pos, uint64_t fsid) {
- magic1 = pos;
- magic2 = fsid ^ epoch ^ len;
- }
- bool check_magic(off64_t pos, uint64_t fsid) {
- return
- magic1 == (uint64_t)pos &&
- magic2 == (fsid ^ epoch ^ len);
- }
- };
-
-private:
- string fn;
-
- off64_t max_size;
- size_t block_size;
- bool directio;
- bool full, writing, must_write_header;
- off64_t write_pos; // byte where next entry written goes
- off64_t read_pos; //
-
- int fd;
-
- // to be journaled
- list<pair<epoch_t,bufferlist> > writeq;
- list<Context*> commitq;
-
- // being journaled
- list<Context*> writingq;
-
- // write thread
- Mutex write_lock;
- Cond write_cond;
- bool write_stop;
-
- int _open(bool wr);
- void print_header();
- void read_header();
- bufferptr prepare_header();
- void start_writer();
- void stop_writer();
- void write_thread_entry();
-
- void check_for_wrap(epoch_t epoch, off64_t pos, off64_t size);
- bool prepare_single_dio_write(bufferlist& bl);
- void prepare_multi_write(bufferlist& bl);
- void do_write(bufferlist& bl);
-
- class Writer : public Thread {
- FileJournal *journal;
- public:
- Writer(FileJournal *fj) : journal(fj) {}
- void *entry() {
- journal->write_thread_entry();
- return 0;
- }
- } write_thread;
-
- off64_t get_top() {
- if (directio)
- return block_size;
- else
- return sizeof(header);
- }
-
- public:
- FileJournal(__u64 fsid, Finisher *fin, const char *f, bool dio=false) :
- Journal(fsid, fin), fn(f),
- max_size(0), block_size(0),
- directio(dio),
- full(false), writing(false), must_write_header(false),
- write_pos(0), read_pos(0),
- fd(-1),
- write_stop(false), write_thread(this) { }
- ~FileJournal() {}
-
- int create();
- int open(epoch_t epoch);
- void close();
-
- bool is_writeable() {
- return read_pos == 0;
- }
- void make_writeable();
-
- // writes
- void submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit); // submit an item
- void commit_epoch_start(epoch_t); // mark epoch boundary
- void commit_epoch_finish(epoch_t); // mark prior epoch as committed (we can expire)
-
- bool read_entry(bufferlist& bl, epoch_t& e);
-
- bool is_full();
-
- // reads
-};
-
-#endif
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __EBOFS_JOURNAL_H
-#define __EBOFS_JOURNAL_H
-
-#include "include/buffer.h"
-#include "include/Context.h"
-#include "common/Finisher.h"
-
-class Journal {
-protected:
- __u64 fsid;
- Finisher *finisher;
-
-public:
- Journal(__u64 f, Finisher *fin) : fsid(f), finisher(fin) { }
- virtual ~Journal() { }
-
- virtual int create() = 0;
- virtual int open(epoch_t epoch) = 0;
- virtual void close() = 0;
-
- // writes
- virtual bool is_writeable() = 0;
- virtual void make_writeable() = 0;
- virtual void submit_entry(epoch_t epoch, bufferlist& e, Context *oncommit) = 0;
- virtual void commit_epoch_start(epoch_t) = 0; // mark epoch boundary
- virtual void commit_epoch_finish(epoch_t) = 0; // mark prior epoch as committed (we can expire)
- virtual bool read_entry(bufferlist& bl, epoch_t &e) = 0;
- virtual bool is_full() = 0;
-
- // reads/recovery
-
-};
-
-#endif
+++ /dev/null
-
-#include "JournalingObjectStore.h"
-
-#include "config.h"
-
-#define dout(x) if (x <= g_conf.debug) *_dout << dbeginl << g_clock.now() << " journal "
-#define derr(x) if (x <= g_conf.debug) *_derr << dbeginl << g_clock.now() << " journal "
-
-int JournalingObjectStore::journal_replay()
-{
- if (!journal)
- return 0;
-
- int err = journal->open(super_epoch);
- if (err < 0) {
- dout(3) << "journal_replay open failed with" << err
- << " " << strerror(err) << dendl;
- delete journal;
- journal = 0;
- return err;
- }
-
- int count = 0;
- while (1) {
- bufferlist bl;
- epoch_t e;
- if (!journal->read_entry(bl, e)) {
- dout(3) << "journal_replay: end of journal, done." << dendl;
- break;
- }
-
- if (e < super_epoch) {
- dout(3) << "journal_replay: skipping old entry in epoch " << e << " < " << super_epoch << dendl;
- continue;
- }
- if (e == super_epoch+1) {
- super_epoch++;
- dout(3) << "journal_replay: jumped to next epoch " << super_epoch << dendl;
- }
- assert(e == super_epoch);
-
- dout(3) << "journal_replay: applying transaction in epoch " << e << dendl;
- Transaction t(bl);
- apply_transaction(t);
- count++;
- }
-
- // done reading, make writeable.
- journal->make_writeable();
-
- return count;
-}
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-#ifndef __JOURNALINGOBJECTSTORE_H
-#define __JOURNALINGOBJECTSTORE_H
-
-#include "ObjectStore.h"
-#include "Journal.h"
-
-class JournalingObjectStore : public ObjectStore {
-protected:
- epoch_t super_epoch;
- Journal *journal;
- Finisher finisher;
- map<version_t, list<Context*> > commit_waiters;
-
- void journal_start() {
- finisher.start();
- }
- void journal_stop() {
- finisher.stop();
- }
- int journal_replay();
-
- void commit_start() {
- super_epoch++;
- if (journal)
- journal->commit_epoch_start(super_epoch);
- }
- void commit_finish() {
- finisher.queue(commit_waiters[super_epoch-1]);
- if (journal)
- journal->commit_epoch_finish(super_epoch);
- }
-
- void queue_commit_waiter(Context *oncommit) {
- if (oncommit)
- commit_waiters[super_epoch].push_back(oncommit);
- }
-
- void journal_write(pobject_t oid, off_t off, size_t len, const bufferlist& bl, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.write(oid, off, len, bl);
- bufferlist tbl;
- t.encode(tbl);
- journal->submit_entry(super_epoch, tbl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_zero(pobject_t oid, off_t off, size_t len, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.zero(oid, off, len);
- bufferlist tbl;
- t.encode(tbl);
- journal->submit_entry(super_epoch, tbl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_remove(pobject_t oid, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.remove(oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_truncate(pobject_t oid, off_t size, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.truncate(oid, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_clone(pobject_t from, pobject_t to, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.clone(from, to);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_setattr(pobject_t oid, const char *name, const void *value, size_t size, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.setattr(oid, name, value, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_setattrs(pobject_t oid, map<string,bufferptr>& attrset, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.setattrs(oid, attrset);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_rmattr(pobject_t oid, const char *name, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.rmattr(oid, name);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_create_collection(coll_t cid, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.create_collection(cid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_destroy_collection(coll_t cid, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.remove_collection(cid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_collection_add(coll_t cid, pobject_t oid, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.collection_add(cid, oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_collection_remove(coll_t cid, pobject_t oid, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.collection_remove(cid, oid);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_collection_setattr(coll_t cid, const char *name, const void *value, size_t size, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.collection_setattr(cid, name, value, size);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_collection_setattrs(coll_t cid, map<string,bufferptr>& aset, Context *onsafe) {
- if (journal && journal->is_writeable()) {
- Transaction t;
- t.collection_setattrs(cid, aset);
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
- void journal_sync(Context *onsafe) {
- if (journal) {
- // journal empty transaction
- Transaction t;
- bufferlist bl;
- t.encode(bl);
- journal->submit_entry(super_epoch, bl, onsafe);
- } else
- queue_commit_waiter(onsafe);
- }
-
-public:
- JournalingObjectStore() : super_epoch(0), journal(0) { }
-
-};
-
-#endif
#include "OSD.h"
#include "OSDMap.h"
-#include "FakeStore.h"
-
+#include "os/FileStore.h"
#include "ebofs/Ebofs.h"
#ifdef USE_OSBDB
if (g_conf.ebofs)
return new Ebofs(dev);
- if (g_conf.fakestore)
- return new FakeStore(dev);
+ if (g_conf.filestore)
+ return new FileStore(dev);
if (S_ISDIR(st.st_mode))
- return new FakeStore(dev);
+ return new FileStore(dev);
else
return new Ebofs(dev);
}
-// <hack> force remount hack for performance testing FakeStore
+// <hack> force remount hack for performance testing FileStore
class C_Remount : public Context {
OSD *osd;
public:
#include "mon/MonMap.h"
-#include "ObjectStore.h"
+#include "os/ObjectStore.h"
#include "PG.h"
#include "common/DecayCounter.h"
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-
-#include "ObjectStore.h"
-
-#include "config.h"
-#include "common/Clock.h"
-
-#define dout(x) if (x < g_conf.debug) *_dout << dbeginl << g_clock.now() << " ager: "
-
-object_t ObjectStore::age_get_oid() {
- if (!age_free_oids.empty()) {
- object_t o = age_free_oids.front();
- age_free_oids.pop_front();
- return o;
- }
- return age_cur_oid++;
- }
-
- ssize_t ObjectStore::age_pick_size() {
- ssize_t max = file_size_distn.sample() * 1024;
- return max/2 + (rand() % 100) * max/200 + 1;
- }
-
- void ObjectStore::age_fill(float pc, utime_t until) {
- bufferptr bp(1024*1024);
- bp.zero();
- bufferlist bl;
- bl.push_back(bp);
- while (1) {
- if (g_clock.now() > until) break;
-
- struct statfs st;
- statfs(&st);
- float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
- if (a >= pc) {
- dout(10) << "age_fill at " << a << " / " << pc << " stopping" << dendl;
- break;
- }
-
- object_t oid = age_get_oid();
-
- int b = rand() % 10;
- age_objects[b].push_back(oid);
-
- ssize_t s = age_pick_size();
-
- dout(10) << "age_fill at " << a << " / " << pc << " creating " << hex << oid << dec << " sz " << s << dendl;
-
- off_t off = 0;
- while (s) {
- ssize_t t = MIN(s, 1024*1024);
- write(oid, t, off, bl, false);
- off += t;
- s -= t;
- }
- oid++;
- }
- }
-
- void ObjectStore::age_empty(float pc) {
- int nper = 20;
- int n = nper;
- while (1) {
- struct statfs st;
- statfs(&st);
- float a = (float)(st.f_blocks-st.f_bavail) / (float)st.f_blocks;
- if (a <= pc) {
- dout(10) << "age_empty at " << a << " / " << pc << " stopping" << dendl;
- break;
- }
-
- int b = rand() % 10;
- n--;
- if (n == 0 || age_objects[b].empty()) {
- dout(10) << "age_empty sync" << dendl;
- //sync();
- sync();
- n = nper;
- continue;
- }
- object_t oid = age_objects[b].front();
- age_objects[b].pop_front();
-
- dout(10) << "age_empty at " << a << " / " << pc << " removing " << hex << oid << dec << dendl;
-
- remove(oid);
- age_free_oids.push_back(oid);
- }
- }
-
-
- void ObjectStore::age(int time,
- float high_water, // fill to this %
- float low_water, // then empty to this %
- int count, // this many times
- float final_water, // and end here ( <= low_water)
- int fake_size_mb) {
- utime_t until = g_clock.now();
- until.sec_ref() += time;
-
- while (age_objects.size() < 10) age_objects.push_back( list<object_t>() );
-
- if (fake_size_mb) {
- int fake_bl = fake_size_mb * 256;
- struct statfs st;
- statfs(&st);
- float f = (float)fake_bl / (float)st.f_blocks;
- high_water = (float)high_water * f;
- low_water = (float)low_water * f;
- final_water = (float)final_water * f;
- dout(10) << "fake " << fake_bl << " / " << st.f_blocks << " is " << f << ", high " << high_water << " low " << low_water << " final " << final_water << dendl;
- }
-
- // init size distn (once)
- if (!did_distn) {
- did_distn = true;
- age_cur_oid = 1;
- file_size_distn.add(1, 19.0758125+0.65434375);
- file_size_distn.add(512, 35.6566);
- file_size_distn.add(1024, 27.7271875);
- file_size_distn.add(2*1024, 16.63503125);
- //file_size_distn.add(4*1024, 106.82384375);
- //file_size_distn.add(8*1024, 81.493375);
- //file_size_distn.add(16*1024, 14.13553125);
- //file_size_distn.add(32*1024, 2.176);
- //file_size_distn.add(256*1024, 0.655938);
- //file_size_distn.add(512*1024, 0.1480625);
- //file_size_distn.add(1*1024*1024, 0.020125); // actually 2, but 32bit
- file_size_distn.normalize();
- }
-
- // clear
- for (int i=0; i<10; i++)
- age_objects[i].clear();
-
- for (int c=1; c<=count; c++) {
- if (g_clock.now() > until) break;
-
- dout(1) << "age " << c << "/" << count << " filling to " << high_water << dendl;
- age_fill(high_water, until);
- if (c == count) {
- dout(1) << "age final empty to " << final_water << dendl;
- age_empty(final_water);
- } else {
- dout(1) << "age " << c << "/" << count << " emptying to " << low_water << dendl;
- age_empty(low_water);
- }
- }
- dout(1) << "age finished" << dendl;
- }
-
+++ /dev/null
-// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*-
-// vim: ts=8 sw=2 smarttab
-/*
- * Ceph - scalable distributed file system
- *
- * Copyright (C) 2004-2006 Sage Weil <sage@newdream.net>
- *
- * This is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Lesser General Public
- * License version 2.1, as published by the Free Software
- * Foundation. See file COPYING.
- *
- */
-
-
-#ifndef __OBJECTSTORE_H
-#define __OBJECTSTORE_H
-
-#include "include/types.h"
-#include "osd_types.h"
-#include "include/Context.h"
-#include "include/buffer.h"
-#include "include/pobject.h"
-
-#include "include/Distribution.h"
-
-#include <sys/stat.h>
-
-#ifdef DARWIN
-#include <sys/statvfs.h>
-#else
-#include <sys/vfs.h> /* or <sys/statfs.h> */
-#endif /* DARWIN */
-
-#include <list>
-using std::list;
-
-#ifndef MIN
-# define MIN(a,b) ((a) < (b) ? (a):(b))
-#endif
-
-/*
- * low-level interface to the local OSD file system
- */
-
-
-
-class ObjectStore {
-public:
-
-
- class FragmentationStat {
- public:
- int total;
- int num_extent;
- int avg_extent;
- map<int,int> extent_dist; // powers of two
- map<int,int> extent_dist_sum; // powers of two
-
- float avg_extent_per_object;
- int avg_extent_jump; // avg distance bweteen consecutive extents
-
- int total_free;
- int num_free_extent;
- int avg_free_extent;
- map<int,int> free_extent_dist; // powers of two
- map<int,int> free_extent_dist_sum; // powers of two
- };
-
-
-
- /*********************************
- * transaction
- */
- class Transaction {
- public:
- static const int OP_READ = 1; // oid, offset, len, pbl
- static const int OP_STAT = 2; // oid, pstat
- static const int OP_GETATTR = 3; // oid, attrname, pattrval
- static const int OP_GETATTRS = 4; // oid, pattrset
-
- static const int OP_WRITE = 10; // oid, offset, len, bl
- static const int OP_ZERO = 11; // oid, offset, len
- static const int OP_TRUNCATE = 12; // oid, len
- static const int OP_REMOVE = 13; // oid
- static const int OP_SETATTR = 14; // oid, attrname, attrval
- static const int OP_SETATTRS = 15; // oid, attrset
- static const int OP_RMATTR = 16; // oid, attrname
- static const int OP_CLONE = 17; // oid, newoid
-
- static const int OP_TRIMCACHE = 18; // oid, offset, len
-
- static const int OP_MKCOLL = 20; // cid
- static const int OP_RMCOLL = 21; // cid
- static const int OP_COLL_ADD = 22; // cid, oid
- static const int OP_COLL_REMOVE = 23; // cid, oid
- static const int OP_COLL_SETATTR = 24; // cid, attrname, attrval
- static const int OP_COLL_RMATTR = 25; // cid, attrname
- static const int OP_COLL_SETATTRS = 26; // cid, attrset
-
- private:
- list<int8_t> ops;
- list<bufferlist> bls;
- list<pobject_t> oids;
- list<coll_t> cids;
- list<int64_t> lengths;
- list<const char*> attrnames;
- list<string> attrnames2;
-
- // for reads only (not encoded)
- list<bufferlist*> pbls;
- list<struct stat*> psts;
- list< pair<void*,int*> > pattrvals;
- list< map<string,bufferptr>* > pattrsets;
-
- public:
- bool have_op() {
- return !ops.empty();
- }
- int get_num_ops() { return ops.size(); }
- int get_op() {
- int op = ops.front();
- ops.pop_front();
- return op;
- }
- void get_bl(bufferlist& bl) {
- bl.claim(bls.front());
- bls.pop_front();
- }
- void get_oid(pobject_t& oid) {
- oid = oids.front();
- oids.pop_front();
- }
- void get_cid(coll_t& cid) {
- cid = cids.front();
- cids.pop_front();
- }
- void get_length(off_t& len) {
- len = lengths.front();
- lengths.pop_front();
- }
- void get_attrname(const char * &p) {
- p = attrnames.front();
- attrnames.pop_front();
- }
- void get_pbl(bufferlist* &pbl) {
- pbl = pbls.front();
- pbls.pop_front();
- }
- void get_pstat(struct stat* &pst) {
- pst = psts.front();
- psts.pop_front();
- }
- void get_pattrval(pair<void*,int*>& p) {
- p = pattrvals.front();
- pattrvals.pop_front();
- }
- void get_pattrset(map<string,bufferptr>* &ps) {
- ps = pattrsets.front();
- pattrsets.pop_front();
- }
-
-
- void read(pobject_t oid, off_t off, size_t len, bufferlist *pbl) {
- int op = OP_READ;
- ops.push_back(op);
- oids.push_back(oid);
- lengths.push_back(off);
- lengths.push_back(len);
- pbls.push_back(pbl);
- }
- void stat(pobject_t oid, struct stat *st) {
- int op = OP_STAT;
- ops.push_back(op);
- oids.push_back(oid);
- psts.push_back(st);
- }
- void getattr(pobject_t oid, const char* name, void* val, int *plen) {
- int op = OP_GETATTR;
- ops.push_back(op);
- oids.push_back(oid);
- attrnames.push_back(name);
- pattrvals.push_back(pair<void*,int*>(val,plen));
- }
- void getattrs(pobject_t oid, map<string,bufferptr>& aset) {
- int op = OP_GETATTRS;
- ops.push_back(op);
- oids.push_back(oid);
- pattrsets.push_back(&aset);
- }
-
- void write(pobject_t oid, off_t off, size_t len, const bufferlist& bl) {
- int op = OP_WRITE;
- ops.push_back(op);
- oids.push_back(oid);
- lengths.push_back(off);
- lengths.push_back(len);
- bls.push_back(bl);
- }
- void zero(pobject_t oid, off_t off, size_t len) {
- int op = OP_ZERO;
- ops.push_back(op);
- oids.push_back(oid);
- lengths.push_back(off);
- lengths.push_back(len);
- }
- void trim_from_cache(pobject_t oid, off_t off, size_t len) {
- int op = OP_TRIMCACHE;
- ops.push_back(op);
- oids.push_back(oid);
- lengths.push_back(off);
- lengths.push_back(len);
- }
- void truncate(pobject_t oid, off_t off) {
- int op = OP_TRUNCATE;
- ops.push_back(op);
- oids.push_back(oid);
- lengths.push_back(off);
- }
- void remove(pobject_t oid) {
- int op = OP_REMOVE;
- ops.push_back(op);
- oids.push_back(oid);
- }
- void setattr(pobject_t oid, const char* name, const void* val, int len) {
- int op = OP_SETATTR;
- ops.push_back(op);
- oids.push_back(oid);
- attrnames.push_back(name);
- //attrvals.push_back(pair<const void*,int>(val,len));
- bufferlist bl;
- bl.append((char*)val,len);
- bls.push_back(bl);
- }
- void setattrs(pobject_t oid, map<string,bufferptr>& attrset) {
- int op = OP_SETATTRS;
- ops.push_back(op);
- oids.push_back(oid);
- pattrsets.push_back(&attrset);
- }
- void rmattr(pobject_t oid, const char* name) {
- int op = OP_RMATTR;
- ops.push_back(op);
- oids.push_back(oid);
- attrnames.push_back(name);
- }
- void clone(pobject_t oid, pobject_t noid) {
- int op = OP_CLONE;
- ops.push_back(op);
- oids.push_back(oid);
- oids.push_back(noid);
- }
- void create_collection(coll_t cid) {
- int op = OP_MKCOLL;
- ops.push_back(op);
- cids.push_back(cid);
- }
- void remove_collection(coll_t cid) {
- int op = OP_RMCOLL;
- ops.push_back(op);
- cids.push_back(cid);
- }
- void collection_add(coll_t cid, pobject_t oid) {
- int op = OP_COLL_ADD;
- ops.push_back(op);
- cids.push_back(cid);
- oids.push_back(oid);
- }
- void collection_remove(coll_t cid, pobject_t oid) {
- int op = OP_COLL_REMOVE;
- ops.push_back(op);
- cids.push_back(cid);
- oids.push_back(oid);
- }
- void collection_setattr(coll_t cid, const char* name, const void* val, int len) {
- int op = OP_COLL_SETATTR;
- ops.push_back(op);
- cids.push_back(cid);
- attrnames.push_back(name);
- bufferlist bl;
- bl.append((char*)val, len);
- bls.push_back(bl);
- }
- void collection_rmattr(coll_t cid, const char* name) {
- int op = OP_COLL_RMATTR;
- ops.push_back(op);
- cids.push_back(cid);
- attrnames.push_back(name);
- }
- void collection_setattrs(coll_t cid, map<string,bufferptr>& aset) {
- int op = OP_COLL_SETATTRS;
- ops.push_back(op);
- cids.push_back(cid);
- pattrsets.push_back(&aset);
- }
-
- // etc.
- Transaction() {}
- Transaction(bufferlist::iterator &p) { decode(p); }
- Transaction(bufferlist &bl) {
- bufferlist::iterator p = bl.begin();
- decode(p);
- }
-
- void encode(bufferlist& bl) const {
- ::encode(ops, bl);
- ::encode(bls, bl);
- ::encode(oids, bl);
- ::encode(cids, bl);
- ::encode(lengths, bl);
- ::encode(attrnames, bl);
- }
- void decode(bufferlist::iterator &bl) {
- ::decode(ops, bl);
- ::decode(bls, bl);
- ::decode(oids, bl);
- ::decode(cids, bl);
- ::decode(lengths, bl);
- ::decode(attrnames2, bl);
- for (list<string>::iterator p = attrnames2.begin();
- p != attrnames2.end();
- ++p)
- attrnames.push_back((*p).c_str());
- }
- };
-
- /*
- * these stubs should be implemented if we want to use the
- * apply_transaction() below and we want atomic transactions.
- */
- virtual int transaction_start() { return 0; }
- virtual void transaction_end(int id) { }
- virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) {
- // non-atomic implementation
- int id = transaction_start();
- while (t.have_op()) {
- int op = t.get_op();
- switch (op) {
- case Transaction::OP_READ:
- {
- pobject_t oid;
- off_t offset, len;
- t.get_oid(oid);
- t.get_length(offset);
- t.get_length(len);
- bufferlist *pbl;
- t.get_pbl(pbl);
- read(oid, offset, len, *pbl);
- }
- break;
- case Transaction::OP_STAT:
- {
- pobject_t oid;
- t.get_oid(oid);
- struct stat *st;
- t.get_pstat(st);
- stat(oid, st);
- }
- break;
- case Transaction::OP_GETATTR:
- {
- pobject_t oid;
- t.get_oid(oid);
- const char *attrname;
- t.get_attrname(attrname);
- pair<void*,int*> pattrval;
- t.get_pattrval(pattrval);
- *pattrval.second = getattr(oid, attrname, pattrval.first, *pattrval.second);
- }
- break;
- case Transaction::OP_GETATTRS:
- {
- pobject_t oid;
- t.get_oid(oid);
- map<string,bufferptr> *pset;
- t.get_pattrset(pset);
- getattrs(oid, *pset);
- }
- break;
-
- case Transaction::OP_WRITE:
- {
- pobject_t oid;
- t.get_oid(oid);
- off_t offset, len;
- t.get_length(offset);
- t.get_length(len);
- bufferlist bl;
- t.get_bl(bl);
- write(oid, offset, len, bl, 0);
- }
- break;
-
- case Transaction::OP_ZERO:
- {
- pobject_t oid;
- t.get_oid(oid);
- off_t offset, len;
- t.get_length(offset);
- t.get_length(len);
- zero(oid, offset, len, 0);
- }
- break;
-
- case Transaction::OP_TRIMCACHE:
- {
- pobject_t oid;
- t.get_oid(oid);
- off_t offset, len;
- t.get_length(offset);
- t.get_length(len);
- trim_from_cache(oid, offset, len);
- }
- break;
-
- case Transaction::OP_TRUNCATE:
- {
- pobject_t oid;
- t.get_oid(oid);
- off_t len;
- t.get_length(len);
- truncate(oid, len, 0);
- }
- break;
-
- case Transaction::OP_REMOVE:
- {
- pobject_t oid;
- t.get_oid(oid);
- remove(oid, 0);
- }
- break;
-
- case Transaction::OP_SETATTR:
- {
- pobject_t oid;
- t.get_oid(oid);
- const char *attrname;
- t.get_attrname(attrname);
- bufferlist bl;
- t.get_bl(bl);
- setattr(oid, attrname, bl.c_str(), bl.length(), 0);
- }
- break;
- case Transaction::OP_SETATTRS:
- {
- pobject_t oid;
- t.get_oid(oid);
- map<string,bufferptr> *pattrset;
- t.get_pattrset(pattrset);
- setattrs(oid, *pattrset, 0);
- }
- break;
-
- case Transaction::OP_RMATTR:
- {
- pobject_t oid;
- t.get_oid(oid);
- const char *attrname;
- t.get_attrname(attrname);
- rmattr(oid, attrname, 0);
- }
- break;
-
- case Transaction::OP_CLONE:
- {
- pobject_t oid;
- t.get_oid(oid);
- pobject_t noid;
- t.get_oid(noid);
- clone(oid, noid);
- }
- break;
-
- case Transaction::OP_MKCOLL:
- {
- coll_t cid;
- t.get_cid(cid);
- create_collection(cid, 0);
- }
- break;
-
- case Transaction::OP_RMCOLL:
- {
- coll_t cid;
- t.get_cid(cid);
- destroy_collection(cid, 0);
- }
- break;
-
- case Transaction::OP_COLL_ADD:
- {
- coll_t cid;
- t.get_cid(cid);
- pobject_t oid;
- t.get_oid(oid);
- collection_add(cid, oid, 0);
- }
- break;
-
- case Transaction::OP_COLL_REMOVE:
- {
- coll_t cid;
- t.get_cid(cid);
- pobject_t oid;
- t.get_oid(oid);
- collection_remove(cid, oid, 0);
- }
- break;
-
- case Transaction::OP_COLL_SETATTR:
- {
- coll_t cid;
- t.get_cid(cid);
- const char *attrname;
- t.get_attrname(attrname);
- bufferlist bl;
- t.get_bl(bl);
- collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0);
- }
- break;
-
- case Transaction::OP_COLL_RMATTR:
- {
- coll_t cid;
- t.get_cid(cid);
- const char *attrname;
- t.get_attrname(attrname);
- collection_rmattr(cid, attrname, 0);
- }
- break;
-
-
- default:
- cerr << "bad op " << op << std::endl;
- assert(0);
- }
- }
- transaction_end(id);
-
- if (onsafe) sync(onsafe);
-
- return 0; // FIXME count errors
- }
-
- /*********************************************/
-
-
-
- public:
- ObjectStore() {}
- virtual ~ObjectStore() {}
-
- // mgmt
- virtual int mount() = 0;
- virtual int umount() = 0;
- virtual int mkfs() = 0; // wipe
-
- virtual int statfs(struct statfs *buf) = 0;
-
- // objects
- virtual int pick_object_revision_lt(pobject_t& oid) = 0;
-
- virtual bool exists(pobject_t oid) = 0; // useful?
- virtual int stat(pobject_t oid, struct stat *st) = 0; // struct stat?
-
- virtual int remove(pobject_t oid,
- Context *onsafe=0) = 0;
-
- virtual int truncate(pobject_t oid, off_t size,
- Context *onsafe=0) = 0;
-
- virtual int read(pobject_t oid,
- off_t offset, size_t len,
- bufferlist& bl) = 0;
- virtual int write(pobject_t oid,
- off_t offset, size_t len,
- const bufferlist& bl,
- Context *onsafe) = 0;//{ return -1; }
- virtual int zero(pobject_t oid,
- off_t offset, size_t len,
- Context *onsafe) {
- // write zeros.. yuck!
- bufferptr bp(len);
- bufferlist bl;
- bl.push_back(bp);
- return write(oid, offset, len, bl, onsafe);
- }
- virtual void trim_from_cache(pobject_t oid,
- off_t offset, size_t len) { }
- virtual int is_cached(pobject_t oid,
- off_t offset,
- size_t len) { return -1; }
-
- virtual int setattr(pobject_t oid, const char *name,
- const void *value, size_t size,
- Context *onsafe=0) {return 0;} //= 0;
- virtual int setattrs(pobject_t oid, map<string,bufferptr>& aset,
- Context *onsafe=0) {return 0;} //= 0;
- virtual int getattr(pobject_t oid, const char *name,
- void *value, size_t size) {return 0;} //= 0;
- virtual int getattrs(pobject_t oid, map<string,bufferptr>& aset) {return 0;};
-
- virtual int rmattr(pobject_t oid, const char *name,
- Context *onsafe=0) {return 0;}
-
- virtual int clone(pobject_t oid, pobject_t noid) {
- return -1;
- }
-
- virtual int list_objects(list<pobject_t>& ls) = 0;//{ return -1; }
-
- virtual int get_object_collections(pobject_t oid, set<coll_t>& ls) { return -1; }
-
- //virtual int listattr(pobject_t oid, char *attrs, size_t size) {return 0;} //= 0;
-
- // collections
- virtual int list_collections(list<coll_t>& ls) {return 0;}//= 0;
- virtual int create_collection(coll_t c,
- Context *onsafe=0) {return 0;}//= 0;
- virtual int destroy_collection(coll_t c,
- Context *onsafe=0) {return 0;}//= 0;
- virtual bool collection_exists(coll_t c) {return 0;}
- virtual int collection_stat(coll_t c, struct stat *st) {return 0;}//= 0;
- virtual int collection_add(coll_t c, pobject_t o,
- Context *onsafe=0) {return 0;}//= 0;
- virtual int collection_remove(coll_t c, pobject_t o,
- Context *onsafe=0) {return 0;}// = 0;
- virtual int collection_list(coll_t c, list<pobject_t>& o) {return 0;}//= 0;
-
- virtual int collection_setattr(coll_t cid, const char *name,
- const void *value, size_t size,
- Context *onsafe=0) {return 0;} //= 0;
- virtual int collection_rmattr(coll_t cid, const char *name,
- Context *onsafe=0) {return 0;} //= 0;
- virtual int collection_getattr(coll_t cid, const char *name,
- void *value, size_t size) {return 0;} //= 0;
-
- virtual int collection_getattrs(coll_t cid, map<string,bufferptr> &aset) = 0;//{ return -1; }
- virtual int collection_setattrs(coll_t cid, map<string,bufferptr> &aset) = 0;//{ return -1; }
-
-
- //virtual int collection_listattr(coll_t cid, char *attrs, size_t size) {return 0;} //= 0;
-
- virtual void sync(Context *onsync) {}
- virtual void sync() {}
-
-
- virtual void _fake_writes(bool b) {};
-
- virtual void _get_frag_stat(FragmentationStat& st) {};
-
-};
-
-
-#endif
#include "include/buffer.h"
#include "OSDMap.h"
-#include "ObjectStore.h"
+#include "os/ObjectStore.h"
#include "msg/Messenger.h"
#include "common/DecayCounter.h"
-// osd types
-typedef uint64_t coll_t; // collection id
// pg stuff
#include <iostream>
#include "ebofs/Ebofs.h"
-#include "osd/FakeStore.h"
+#include "os/FileStore.h"
struct io {
utime_t start, ack, commit;
<< seconds << " seconds, " << bytes << " bytes per write" << std::endl;
//ObjectStore *fs = new Ebofs(filename, journal);
- ObjectStore *fs = new FakeStore(filename);
+ ObjectStore *fs = new FileStore(filename);
if (g_conf.mkfs &&
fs->mkfs() < 0) {
ARGS="-d"
# start monitor
-$CEPH_BIN/cmon $ARGS mondata/mon0 --debug_mon 20 --debug_ms 1
+#valgrind --tool=massif
+#valgrind --leak-check=full --show-reachable=yes $CEPH_BIN/cmon mondata/mon0 --debug_mon 20 --debug_ms 1 > out/mon0 &
+#valgrind --tool=massif $CEPH_BIN/cmon mondata/mon0 --debug_mon 20 --debug_ms 1 > out/mon0 &
+#sleep 1
+$CEPH_BIN/cmon -d mondata/mon0 --debug_mon 20 --debug_ms 1
# build and inject an initial osd map
-$CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 8 --print .ceph_osdmap --pgbits 2
+$CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap # --pgbits 2
$CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap
-for osd in 0 1 2 3 #4 5 6 7
+for osd in 0 1 2 3 #4 5 6 7 8 9 10 11 12 13 14 15
do
$CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # initialize empty object store
- #valgrind --tool=massif $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_fakestore 10 1>out/o$osd & #--debug_osd 40
- $CEPH_BIN/cosd dev/osd$osd -d --debug_ms 1 --debug_osd 20 --debug_fakestore 10
+ #valgrind --tool=massif $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_filestore 10 1>out/o$osd & #--debug_osd 40
+ $CEPH_BIN/cosd dev/osd$osd -d --debug_ms 1 --debug_osd 20 --debug_filestore 10
done
# mds