From: Sage Weil Date: Thu, 5 Jun 2008 23:58:43 +0000 (-0700) Subject: filestore: alternative btrfs usertrans ioctl X-Git-Tag: v0.3~139 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=bfe3ca572a36e9a91207f3f858bbfe91982c5465;p=ceph.git filestore: alternative btrfs usertrans ioctl --- diff --git a/src/TODO b/src/TODO index 7f9dd685128..d1231af1eed 100644 --- a/src/TODO +++ b/src/TODO @@ -130,6 +130,8 @@ rados snapshots objecter +- fix failure handler... + - generic mon client? - maybe_request_map should set a timer event to periodically re-request. - transaction prepare/commit? - read+floor_lockout diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index ed832c2e0fd..dbe84670d5b 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -50,6 +50,34 @@ # define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) # define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) # define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) + +// alternate usertrans interface... +#define BTRFS_IOC_USERTRANS_OPEN 1 +#define BTRFS_IOC_USERTRANS_CLOSE 2 +#define BTRFS_IOC_USERTRANS_SEEK 3 +#define BTRFS_IOC_USERTRANS_WRITE 5 +#define BTRFS_IOC_USERTRANS_UNLINK 6 +#define BTRFS_IOC_USERTRANS_MKDIR 7 +#define BTRFS_IOC_USERTRANS_RMDIR 8 +#define BTRFS_IOC_USERTRANS_TRUNCATE 9 +#define BTRFS_IOC_USERTRANS_SETXATTR 10 +#define BTRFS_IOC_USERTRANS_REMOVEXATTR 11 +#define BTRFS_IOC_USERTRANS_CLONE 12 + +struct btrfs_ioctl_usertrans_op { + __u64 op; + __s64 args[5]; + __s64 rval; +}; + +struct btrfs_ioctl_usertrans { + __u64 len; + struct btrfs_ioctl_usertrans_op ops[0]; +}; + +#define BTRFS_IOC_USERTRANS _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_usertrans) + #endif #endif @@ -282,26 +310,30 @@ int FileStore::mount() char fn[100]; int fd; -#ifdef BTRFS_IOC_SYNC + // get fsid + sprintf(fn, "%s/fsid", basedir.c_str()); + lock_fd = ::open(fn, O_RDONLY); + ::read(lock_fd, &fsid, sizeof(fsid)); + + // and lock it.. + // FIXME + + dout(10) << "mount fsid is " << fsid << dendl; + +#ifdef BTRFS_IOC_USERTRANS // is this btrfs? - btrfs_fd = ::open(basedir.c_str(), O_DIRECTORY); - r = ::ioctl(btrfs_fd, BTRFS_IOC_SYNC); + Transaction empty; + btrfs = true; + btrfs_trans_start_end = true; // trans start/end interface + r = apply_transaction(empty, 0); if (r == 0) { dout(0) << "mount detected btrfs" << dendl; } else { dout(0) << "mount did NOT detect btrfs: " << strerror(-r) << dendl; - ::close(btrfs_fd); - btrfs_fd = -1; + btrfs = false; } #endif - - // get fsid - sprintf(fn, "%s/fsid", basedir.c_str()); - fd = ::open(fn, O_RDONLY); - ::read(fd, &fsid, sizeof(fsid)); - ::close(fd); - dout(10) << "mount fsid is " << fsid << dendl; - + // get epoch sprintf(fn, "%s/commit_epoch", basedir.c_str()); fd = ::open(fn, O_RDONLY); @@ -342,10 +374,7 @@ int FileStore::umount() lock.Unlock(); sync_thread.join(); - if (btrfs_fd >= 0) { - ::close(btrfs_fd); - btrfs_fd = -1; - } + ::close(lock_fd); if (g_conf.filestore_dev) { char cmd[100]; @@ -358,19 +387,22 @@ int FileStore::umount() return 0; } +// btrfs transaction start/end interface -int FileStore::transaction_start() +int FileStore::transaction_start(int len) { - if (btrfs_fd < 0) + if (!btrfs || !btrfs_trans_start_end) return 0; int fd = ::open(basedir.c_str(), O_RDONLY); - if (fd < 0) + if (fd < 0) { derr(0) << "transaction_start got " << strerror(errno) - << " from btrfs open" << dendl; + << " from btrfs open" << dendl; + assert(0); + } if (::ioctl(fd, BTRFS_IOC_TRANS_START) < 0) { derr(0) << "transaction_start got " << strerror(errno) - << " from btrfs ioctl" << dendl; + << " from btrfs ioctl" << dendl; ::close(fd); return -errno; } @@ -380,12 +412,449 @@ int FileStore::transaction_start() void FileStore::transaction_end(int fd) { - if (btrfs_fd < 0) + if (!btrfs || !btrfs_trans_start_end) return; dout(10) << "transaction_end " << fd << dendl; ::close(fd); } +unsigned FileStore::apply_transaction(Transaction &t, Context *onsafe) +{ + // no btrfs transaction support? + // or, use trans start/end ioctls? + if (!btrfs || btrfs_trans_start_end) + return ObjectStore::apply_transaction(t, onsafe); + + // create transaction + int len = t.get_len() * 30; // very conservative! FIXME FIXME FIXME + dout(20) << "apply_transaction allocation btrfs usertrans len " << len << dendl; + btrfs_ioctl_usertrans *trans = + (btrfs_ioctl_usertrans *)new char[sizeof(*trans) + len * sizeof(trans->ops[0])]; + + trans->len = 0; + + list str; + + while (t.have_op()) { + int op = t.get_op(); + + switch (op) { + case Transaction::OP_READ: + { + coll_t cid; + pobject_t oid; + __u64 offset, len; + t.get_cid(cid); + t.get_oid(oid); + t.get_length(offset); + t.get_length(len); + bufferlist *pbl; + t.get_pbl(pbl); + read(cid, oid, offset, len, *pbl); + } + break; + case Transaction::OP_STAT: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + struct stat *st; + t.get_pstat(st); + stat(cid, oid, st); + } + break; + case Transaction::OP_GETATTR: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); + pair pattrval; + t.get_pattrval(pattrval); + *pattrval.second = getattr(cid, oid, attrname, pattrval.first, *pattrval.second); + } + break; + case Transaction::OP_GETATTRS: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + map *pset; + t.get_pattrset(pset); + getattrs(cid, oid, *pset); + } + break; + + case Transaction::OP_WRITE: + case Transaction::OP_ZERO: // write actual zeros. + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + __u64 offset, len; + t.get_length(offset); + t.get_length(len); + bufferlist bl; + if (op == Transaction::OP_WRITE) + t.get_bl(bl); + else { + bufferptr bp(len); + bp.zero(); + bl.push_back(bp); + } + + dout(10) << "write" << dendl; + //write(cid, oid, offset, len, bl, 0); + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_OPEN; + trans->ops[trans->len].args[0] = (__s64)fn; + trans->ops[trans->len].args[1] = O_WRONLY|O_CREAT; + trans->len++; + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_SEEK; + trans->ops[trans->len].args[0] = -1; + trans->ops[trans->len].args[1] = offset; + trans->ops[trans->len].args[2] = (__s64)&trans->ops[trans->len].args[4]; // whatever. + trans->ops[trans->len].args[3] = SEEK_SET; + trans->len++; + for (list::const_iterator it = bl.buffers().begin(); + it != bl.buffers().end(); + it++) { + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_WRITE; + trans->ops[trans->len].args[0] = -1; + trans->ops[trans->len].args[1] = (__s64)(*it).c_str(); + trans->ops[trans->len].args[2] = (__s64)(*it).length(); + trans->len++; + } + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_CLOSE; + trans->ops[trans->len].args[0] = -1; + trans->len++; + } + break; + + case Transaction::OP_TRIMCACHE: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + __u64 offset, len; + t.get_length(offset); + t.get_length(len); + trim_from_cache(cid, oid, offset, len); + } + break; + + case Transaction::OP_TRUNCATE: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + __u64 len; + t.get_length(len); + //truncate(cid, oid, len, 0); + + dout(10) << "truncate" << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_TRUNCATE; + trans->ops[trans->len].args[0] = (__s64)fn; + trans->ops[trans->len].args[1] = len; + trans->len++; + } + break; + + case Transaction::OP_REMOVE: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + //remove(cid, oid, 0); + + dout(10) << "remove " << cid << " " << oid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_UNLINK; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->len++; + } + break; + + case Transaction::OP_SETATTR: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); + bufferlist bl; + t.get_bl(bl); + //setattr(cid, oid, attrname, bl.c_str(), bl.length(), 0); + dout(10) << "setattr " << cid << " " << oid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_SETXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + char aname[40]; + sprintf(aname, "user.ceph.%s", attrname); + trans->ops[trans->len].args[1] = (__u64)aname; + trans->ops[trans->len].args[2] = (__u64)bl.c_str(); + trans->ops[trans->len].args[3] = bl.length(); + trans->ops[trans->len].args[4] = 0; + trans->len++; + } + break; + case Transaction::OP_SETATTRS: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + map *pattrset; + t.get_pattrset(pattrset); + //setattrs(cid, oid, *pattrset, 0); + + // make note of old attrs + map oldattrs; + getattrs(cid, oid, oldattrs); + + dout(10) << "setattrs " << cid << " " << oid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + for (map::iterator p = pattrset->begin(); + p != pattrset->end(); + p++) { + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_SETXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + char *aname = new char[40]; + str.push_back(aname); + sprintf(aname, "user.ceph.%s", p->first.c_str()); + trans->ops[trans->len].args[1] = (__u64)aname; + trans->ops[trans->len].args[2] = (__u64)p->second.c_str(); + trans->ops[trans->len].args[3] = p->second.length(); + trans->ops[trans->len].args[4] = 0; + trans->len++; + oldattrs.erase(p->first); + } + + // and remove any leftovers + for (map::iterator p = oldattrs.begin(); + p != oldattrs.end(); + p++) { + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_REMOVEXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->ops[trans->len].args[1] = (__u64)p->first.c_str(); + trans->len++; + } + } + break; + + case Transaction::OP_RMATTR: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + const char *attrname; + t.get_attrname(attrname); + //rmattr(cid, oid, attrname, 0); + + dout(10) << "rmattr " << cid << " " << oid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_coname(cid, oid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_REMOVEXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->ops[trans->len].args[1] = (__u64)attrname; + trans->len++; + } + break; + + case Transaction::OP_CLONE: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + pobject_t noid; + t.get_oid(noid); + //clone(cid, oid, noid); + + dout(10) << "clone " << cid << " " << oid << dendl; + char *ofn = new char[80]; + str.push_back(ofn); + char *nfn = new char[80]; + str.push_back(nfn); + get_coname(cid, oid, ofn); + get_coname(cid, noid, nfn); + + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_OPEN; + trans->ops[trans->len].args[0] = (__u64)nfn; + trans->ops[trans->len].args[1] = O_WRONLY; + trans->len++; + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_OPEN; + trans->ops[trans->len].args[0] = (__u64)ofn; + trans->ops[trans->len].args[1] = O_RDONLY; + trans->len++; + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_CLONE; + trans->ops[trans->len].args[0] = -2; + trans->ops[trans->len].args[1] = -1; + trans->len++; + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_CLONE; + trans->ops[trans->len].args[0] = -1; + trans->len++; + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_CLONE; + trans->ops[trans->len].args[0] = -2; + trans->len++; + } + break; + + case Transaction::OP_MKCOLL: + { + coll_t cid; + t.get_cid(cid); + //create_collection(cid, 0); + dout(10) << "mkcoll " << cid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_cdir(cid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_MKDIR; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->ops[trans->len].args[1] = 0644; + trans->len++; + } + break; + + case Transaction::OP_RMCOLL: + { + coll_t cid; + t.get_cid(cid); + //destroy_collection(cid, 0); + dout(10) << "rmcoll " << cid << dendl; + char *fn = new char[80]; + str.push_back(fn); + get_cdir(cid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_RMDIR; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->ops[trans->len].args[1] = 0644; + trans->len++; + } + break; + + case Transaction::OP_COLL_ADD: + { + coll_t cid, ocid; + t.get_cid(cid); + t.get_cid(ocid); + pobject_t oid; + t.get_oid(oid); + collection_add(cid, ocid, oid, 0); + assert(0); + } + break; + + case Transaction::OP_COLL_REMOVE: + { + coll_t cid; + t.get_cid(cid); + pobject_t oid; + t.get_oid(oid); + collection_remove(cid, oid, 0); + assert(0); + } + break; + + case Transaction::OP_COLL_SETATTR: + { + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); + bufferlist bl; + t.get_bl(bl); + dout(10) << "coll_setattr " << cid << dendl; + //collection_setattr(cid, attrname, bl.c_str(), bl.length(), 0); + char *fn = new char[80]; + str.push_back(fn); + get_cdir(cid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_SETXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + char aname[40]; + sprintf(aname, "user.ceph.%s", attrname); + trans->ops[trans->len].args[1] = (__u64)aname; + trans->ops[trans->len].args[2] = (__u64)bl.c_str(); + trans->ops[trans->len].args[3] = bl.length(); + trans->ops[trans->len].args[4] = 0; + trans->len++; + } + break; + + case Transaction::OP_COLL_RMATTR: + { + coll_t cid; + t.get_cid(cid); + const char *attrname; + t.get_attrname(attrname); + dout(10) << "coll_rmattr " << cid << dendl; + //collection_rmattr(cid, attrname, 0); + char *fn = new char[80]; + str.push_back(fn); + get_cdir(cid, fn); + trans->ops[trans->len].op = BTRFS_IOC_USERTRANS_REMOVEXATTR; + trans->ops[trans->len].args[0] = (__u64)fn; + trans->ops[trans->len].args[1] = (__u64)attrname; + trans->len++; + } + break; + + + default: + cerr << "bad op " << op << std::endl; + assert(0); + } + } + + dout(20) << "apply_transaction final btrfs usertrans len is " << trans->len << dendl; + + // apply + int r = 0; + if (trans->len) { + r = ::ioctl(lock_fd, BTRFS_IOC_USERTRANS, (unsigned long)trans); + if (r < 0) { + derr(0) << "apply_transaction_end got " << strerror(errno) + << " from btrfs usertrans ioctl" << dendl; + r = -errno; + } + } + delete[] (char *)trans; + + while (!str.empty()) { + delete[] str.front(); + str.pop_front(); + } + + if (r >= 0) + journal_transaction(t, onsafe); + else + delete onsafe; + + return r; +} + + // -------------------- // objects @@ -537,7 +1006,7 @@ int FileStore::clone(coll_t cid, pobject_t oldoid, pobject_t newoid) if (n < 0) return -errno; int r = 0; - if (btrfs_fd >= 0) + if (btrfs) r = ::ioctl(n, BTRFS_IOC_CLONE, o); else { struct stat st; diff --git a/src/os/FileStore.h b/src/os/FileStore.h index e5906b7f45a..1138d4a888b 100644 --- a/src/os/FileStore.h +++ b/src/os/FileStore.h @@ -38,7 +38,9 @@ class FileStore : public JournalingObjectStore { string basedir; __u64 fsid; - int btrfs_fd; // >= if btrfs + bool btrfs; + bool btrfs_trans_start_end; + int lock_fd; // fake attrs? FakeAttrs attrs; @@ -75,7 +77,8 @@ class FileStore : public JournalingObjectStore { public: FileStore(const char *base) : basedir(base), - btrfs_fd(-1), + btrfs(false), btrfs_trans_start_end(false), + lock_fd(-1), attrs(this), fake_attrs(false), collections(this), fake_collections(false), stop(false), sync_thread(this) { } @@ -84,11 +87,12 @@ class FileStore : public JournalingObjectStore { int umount(); int mkfs(); - int transaction_start(); - void transaction_end(int id); - int statfs(struct statfs *buf); + int transaction_start(int len); + void transaction_end(int id); + unsigned apply_transaction(Transaction& t, Context *onsafe=0); + // ------------------ // objects int pick_object_revision_lt(pobject_t& oid) { diff --git a/src/os/JournalingObjectStore.h b/src/os/JournalingObjectStore.h index 9b0f5e0c6e3..ff31a373513 100644 --- a/src/os/JournalingObjectStore.h +++ b/src/os/JournalingObjectStore.h @@ -49,6 +49,15 @@ protected: commit_waiters[super_epoch].push_back(oncommit); } + void journal_transaction(Transaction &t, Context *onsafe) { + if (journal && journal->is_writeable()) { + bufferlist tbl; + t.encode(tbl); + journal->submit_entry(super_epoch, tbl, onsafe); + } else + queue_commit_waiter(onsafe); + } + void journal_write(coll_t cid, pobject_t oid, off_t off, size_t len, const bufferlist& bl, Context *onsafe) { if (journal && journal->is_writeable()) { Transaction t; diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index 575cec943a9..b3b2caeb7fb 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -113,6 +113,8 @@ public: list< map* > pattrsets; public: + int get_len() { return ops.size(); } // FIXME maintain a counter? + bool have_op() { return !ops.empty(); } @@ -341,11 +343,11 @@ public: * these stubs should be implemented if we want to use the * apply_transaction() below and we want atomic transactions. */ - virtual int transaction_start() { return 0; } + virtual int transaction_start(int len) { return 0; } virtual void transaction_end(int id) { } virtual unsigned apply_transaction(Transaction& t, Context *onsafe=0) { // non-atomic implementation - int id = transaction_start(); + int id = transaction_start(t.get_len()); while (t.have_op()) { int op = t.get_op(); switch (op) { diff --git a/src/vstartnew.sh b/src/vstartnew.sh index 6fc0eab805b..67a4e399f49 100755 --- a/src/vstartnew.sh +++ b/src/vstartnew.sh @@ -40,7 +40,7 @@ $CEPH_BIN/cmon -d mondata/mon0 --debug_mon 20 --debug_ms 1 $CEPH_BIN/osdmaptool --clobber --createsimple .ceph_monmap 4 --print .ceph_osdmap # --pgbits 2 $CEPH_BIN/cmonctl osd setmap -i .ceph_osdmap -for osd in 0 #1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +for osd in 0 1 #2 3 4 5 6 7 8 9 10 11 12 13 14 15 do $CEPH_BIN/cosd --mkfs_for_osd $osd dev/osd$osd # initialize empty object store #valgrind --tool=massif $CEPH_BIN/cosd dev/osd$osd --debug_ms 1 --debug_osd 20 --debug_filestore 10 1>out/o$osd & #--debug_osd 40