From 24113c941c7857ee019a1967b450f9cd2fc45ca8 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Fri, 11 Jul 2008 13:28:29 -0700 Subject: [PATCH] client: basic objectcacher read/write support --- src/client/Client.cc | 27 +++++++++++------- src/config.cc | 3 +- src/config.h | 1 + src/include/ceph_fs.h | 11 ++++---- src/mds/MDCache.cc | 3 +- src/mds/Server.cc | 60 ++++++++++++++++++++++++++++++++++++++++ src/mds/Server.h | 1 + src/mds/snap.cc | 30 ++++++++++++++++++++ src/mds/snap.h | 1 + src/osdc/ObjectCacher.cc | 19 +++++++------ src/osdc/ObjectCacher.h | 1 + 11 files changed, 131 insertions(+), 26 deletions(-) diff --git a/src/client/Client.cc b/src/client/Client.cc index 6d90da3c06fe4..5e2c4fada4c16 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -1590,6 +1590,8 @@ void Client::maybe_update_snaprealm(SnapRealm *realm, snapid_t snap_created, for (xlist::iterator p = realm->inodes_with_caps.begin(); !p.end(); ++p) { Inode *in = *p; check_caps(in, true); // force writeback of write caps + if (g_conf.client_oc) + _flush(in); } realm->snaps = snaps; // ok. @@ -3262,14 +3264,16 @@ int Client::_read(Fh *f, __s64 offset, __u64 size, bufferlist *bl) dout(10) << "readahead " << f->nr_consec_read << " reads " << f->consec_read_bytes << " bytes ... 
readahead " << offset << "~" << l << " (caller wants " << offset << "~" << size << ")" << dendl; - #warning bleh - //objectcacher->file_read(in->inode.ino, &in->inode.layout, offset, l, NULL, 0, 0); + objectcacher->file_read(in->inode.ino, &in->inode.layout, + CEPH_NOSNAP, in->snaprealm->snaps, + offset, l, NULL, 0, 0); dout(10) << "readahead initiated" << dendl; } // read (and possibly block) - #warning bleh - //r = objectcacher->file_read(in->inode.ino, &in->inode.layout, offset, size, bl, 0, onfinish); + r = objectcacher->file_read(in->inode.ino, &in->inode.layout, + CEPH_NOSNAP, in->snaprealm->snaps, + offset, size, bl, 0, onfinish); if (r == 0) { while (!done) @@ -3280,8 +3284,9 @@ int Client::_read(Fh *f, __s64 offset, __u64 size, bufferlist *bl) delete onfinish; } } else { - #warning bleh - //r = objectcacher->file_atomic_sync_read(in->inode.ino, &in->inode.layout, offset, size, bl, 0, client_lock); + r = objectcacher->file_atomic_sync_read(in->inode.ino, &in->inode.layout, + CEPH_NOSNAP, in->snaprealm->snaps, + offset, size, bl, 0, client_lock); } } else { @@ -3448,12 +3453,14 @@ int Client::_write(Fh *f, __s64 offset, __u64 size, const char *buf) objectcacher->wait_for_write(size, client_lock); // async, caching, non-blocking. - #warning bleh - //objectcacher->file_write(in->inode.ino, &in->inode.layout, offset, size, bl, 0); + objectcacher->file_write(in->inode.ino, &in->inode.layout, + CEPH_NOSNAP, in->snaprealm->snaps, + offset, size, bl, 0); } else { // atomic, synchronous, blocking. 
- #warning bleh - //objectcacher->file_atomic_sync_write(in->inode.ino, &in->inode.layout, offset, size, bl, 0, client_lock); + objectcacher->file_atomic_sync_write(in->inode.ino, &in->inode.layout, + CEPH_NOSNAP, in->snaprealm->snaps, + offset, size, bl, 0, client_lock); } } else { // simple, non-atomic sync write diff --git a/src/config.cc b/src/config.cc index b9a9740cdacf0..ea8c6fc1212d4 100644 --- a/src/config.cc +++ b/src/config.cc @@ -292,11 +292,12 @@ md_config_t g_conf = { client_readahead_min: 128*1024, // readahead at _least_ this much. client_readahead_max_bytes: 0,//8 * 1024*1024, client_readahead_max_periods: 4, // as multiple of file layout period (object size * num stripes) + client_snapdir: ".snap", fuse_direct_io: 0, fuse_ll: true, // --- objectcacher --- - client_oc: false,//until snaps are done... true, + client_oc: true, client_oc_size: 1024*1024* 64, // MB * n client_oc_max_dirty: 1024*1024* 48, // MB * n (dirty OR tx.. bigish) client_oc_target_dirty: 1024*1024* 8, // target dirty (keep this smallish) diff --git a/src/config.h b/src/config.h index 70ebe7ef0e6a5..faf311dea60da 100644 --- a/src/config.h +++ b/src/config.h @@ -158,6 +158,7 @@ struct md_config_t { loff_t client_readahead_min; loff_t client_readahead_max_bytes; loff_t client_readahead_max_periods; + const char *client_snapdir; int fuse_direct_io; bool fuse_ll; diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 40be35b755d35..c1e01c14cdcdd 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -532,9 +532,9 @@ struct ceph_mds_session_head { * & 0x10000 -> follow symlink (e.g. stat(), not lstat()). 
& & 0x100000 -> use weird ino/path trace */ -#define CEPH_MDS_OP_WRITE 0x01000 -#define CEPH_MDS_OP_FOLLOW_LINK 0x10000 -#define CEPH_MDS_OP_INO_PATH 0x100000 +#define CEPH_MDS_OP_WRITE 0x001000 +#define CEPH_MDS_OP_FOLLOW_LINK 0x010000 +#define CEPH_MDS_OP_INO_PATH 0x100000 enum { CEPH_MDS_OP_FINDINODE = 0x100100, @@ -566,8 +566,9 @@ enum { CEPH_MDS_OP_FSYNC = 0x00304, CEPH_MDS_OP_READDIR = 0x00305, - CEPH_MDS_OP_MKSNAP = 0x01010, - CEPH_MDS_OP_RMSNAP = 0x01011, + CEPH_MDS_OP_MKSNAP = 0x01400, + CEPH_MDS_OP_RMSNAP = 0x01401, + CEPH_MDS_OP_LSSNAP = 0x00402, }; static inline const char *ceph_mds_op_name(int op) diff --git a/src/mds/MDCache.cc b/src/mds/MDCache.cc index 974f04afc6b96..594c02eb395bb 100644 --- a/src/mds/MDCache.cc +++ b/src/mds/MDCache.cc @@ -950,11 +950,10 @@ CInode *MDCache::pick_inode_snap(CInode *in, snapid_t follows) if (snaps.empty()) return in; - CInode *t = 0; for (set::const_iterator p = snaps.upper_bound(follows); p != snaps.end(); p++) { - t = get_inode(in->ino(), *p); + CInode *t = get_inode(in->ino(), *p); if (t) { in = t; dout(10) << "pick_inode_snap snap " << *p << " found " << *in << dendl; diff --git a/src/mds/Server.cc b/src/mds/Server.cc index 88217a2b34242..ba87d5b7b9e7f 100644 --- a/src/mds/Server.cc +++ b/src/mds/Server.cc @@ -812,6 +812,9 @@ void Server::dispatch_client_request(MDRequest *mdr) // snaps + case CEPH_MDS_OP_LSSNAP: + handle_client_lssnap(mdr); + break; case CEPH_MDS_OP_MKSNAP: handle_client_mksnap(mdr); break; @@ -4632,6 +4635,63 @@ void Server::handle_client_openc(MDRequest *mdr) // snaps +void Server::handle_client_lssnap(MDRequest *mdr) +{ + MClientRequest *req = mdr->client_request; + + // traverse to path + vector trace; + int r = mdcache->path_traverse(mdr, req, + req->get_filepath(), trace, false, + MDS_TRAVERSE_FORWARD); + if (r > 0) return; + if (trace.empty()) r = -EINVAL; // can't snap root + if (r < 0) { + reply_request(mdr, r); + return; + } + CDentry *dn = trace[trace.size()-1]; + assert(dn); + if 
(!dn->is_auth()) { // fw to auth? + mdcache->request_forward(mdr, dn->authority().first); + return; + } + + // dir only + CInode *diri = dn->inode; + if (!dn->is_primary() || !diri->is_dir()) { + reply_request(mdr, -ENOTDIR); + return; + } + dout(10) << "lssnap " << req->get_path2() << " on " << *diri << dendl; + + // lock snap + set rdlocks, wrlocks, xlocks; + + // rdlock path + for (int i=0; i<(int)trace.size()-1; i++) + rdlocks.insert(&trace[i]->lock); + + // rdlock ancestor snaps + CInode *t = diri; + rdlocks.insert(&diri->snaplock); + while (t->get_parent_dn()) { + t = t->get_parent_dn()->get_dir()->get_inode(); + rdlocks.insert(&t->snaplock); + } + + if (!mds->locker->acquire_locks(mdr, rdlocks, wrlocks, xlocks)) + return; + + SnapRealm *realm = diri->find_snaprealm(); + bufferlist snapinfo; + realm->get_snap_info(snapinfo); + + MClientReply *reply = new MClientReply(req); + reply->set_dir_bl(snapinfo); + reply_request(mdr, reply); +} + void Server::handle_client_mksnap(MDRequest *mdr) { MClientRequest *req = mdr->client_request; diff --git a/src/mds/Server.h b/src/mds/Server.h index 42197467b031b..bc32818fa6792 100644 --- a/src/mds/Server.h +++ b/src/mds/Server.h @@ -159,6 +159,7 @@ public: void _rename_finish(MDRequest *mdr, CDentry *srcdn, CDentry *destdn, CDentry *straydn); + void handle_client_lssnap(MDRequest *mdr); void handle_client_mksnap(MDRequest *mdr); void handle_client_rmsnap(MDRequest *mdr); diff --git a/src/mds/snap.cc b/src/mds/snap.cc index f1635a43fb91f..4a893717259b8 100644 --- a/src/mds/snap.cc +++ b/src/mds/snap.cc @@ -134,6 +134,36 @@ const set& SnapRealm::update_snaps(snapid_t creating) } +void SnapRealm::get_snap_info(bufferlist& bl, snapid_t first, snapid_t last) +{ + dout(10) << "get_snap_info snaps " << get_snaps() << dendl; + + // include my snaps within interval [first,last] + for (map::iterator p = snaps.lower_bound(first); // first element >= first + p != snaps.end() && p->first <= last; + p++) + ::encode(p->second, bl); + + // 
include snaps for parents during intervals that intersect [first,last] + snapid_t thru = first; + for (map::iterator p = past_parents.lower_bound(first); + p != past_parents.end() && p->first >= first && p->second.first <= last; + p++) { + CInode *oldparent = mdcache->get_inode(p->second.dirino); + assert(oldparent); // call open_parents first! + assert(oldparent->snaprealm); + + thru = MIN(last, p->first); + oldparent->snaprealm->get_snap_info(bl, + MAX(first, p->second.first), + thru); + ++thru; + } + if (thru <= last && parent) + parent->get_snap_info(bl, thru, last); +} + + void SnapRealm::split_at(SnapRealm *child) { dout(10) << "split_at " << *child diff --git a/src/mds/snap.h b/src/mds/snap.h index 3116add8639ff..21555e5f82f19 100644 --- a/src/mds/snap.h +++ b/src/mds/snap.h @@ -124,6 +124,7 @@ struct SnapRealm { void build_snap_set(set& s, snapid_t first, snapid_t last); const set& get_snaps(); const vector& get_snap_vector(); + void get_snap_info(bufferlist& snapinfo, snapid_t first=0, snapid_t last=CEPH_NOSNAP); const set& update_snaps(snapid_t adding=0); snapid_t get_latest_snap() { const set &snaps = get_snaps(); diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index ae710cc1bae8a..9bc3865f8b450 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -23,6 +23,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::split(BufferHead *left, off_t of ObjectCacher::BufferHead *right = new BufferHead(this); right->last_write_tid = left->last_write_tid; right->set_state(left->get_state()); + right->snaps = left->snaps; off_t newleftlen = off - left->start(); right->set_start(off); @@ -401,9 +402,10 @@ void ObjectCacher::bh_read(BufferHead *bh) C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); // go - #warning bleh - //objecter->read(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), &onfinish->bl, 0, - //onfinish); + objecter->read(bh->ob->get_oid(), bh->start(), 
bh->length(), bh->ob->get_layout(), + bh->snaps, + &onfinish->bl, 0, + onfinish); } void ObjectCacher::bh_read_finish(object_t oid, off_t start, size_t length, bufferlist &bl) @@ -491,11 +493,9 @@ void ObjectCacher::bh_write(BufferHead *bh) C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); // go - tid_t tid = - 0; - #warning bleh - //objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), bh->bl, 0, - // onack, oncommit); + tid_t tid = objecter->write(bh->ob->get_oid(), bh->start(), bh->length(), bh->ob->get_layout(), + bh->snaps, bh->bl, 0, + onack, oncommit); // set bh last_write_tid onack->tid = tid; @@ -748,6 +748,7 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) for (map::iterator bh_it = missing.begin(); bh_it != missing.end(); bh_it++) { + bh_it->second->snaps = rd->snaps; bh_read(bh_it->second); if (success && onfinish) { dout(10) << "readx missed, waiting on " << *bh_it->second @@ -761,6 +762,7 @@ int ObjectCacher::readx(Objecter::OSDRead *rd, inodeno_t ino, Context *onfinish) for (map::iterator bh_it = rx.begin(); bh_it != rx.end(); bh_it++) { + bh_it->second->snaps = rd->snaps; touch_bh(bh_it->second); // bump in lru, so we don't lose it. if (success && onfinish) { dout(10) << "readx missed, waiting on " << *bh_it->second @@ -878,6 +880,7 @@ int ObjectCacher::writex(Objecter::OSDWrite *wr, inodeno_t ino) // map it all into a single bufferhead. 
BufferHead *bh = o->map_write(wr); + bh->snaps = wr->snaps; // adjust buffer pointers (ie "copy" data into my cache) // this is over a single ObjectExtent, so we know that diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index 73235c4817ebc..beda0e5716cfb 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -45,6 +45,7 @@ class ObjectCacher { bufferlist bl; tid_t last_write_tid; // version of bh (if non-zero) utime_t last_write; + vector snaps; map< off_t, list > waitfor_read; -- 2.39.5