From fc057ff8a7aca883dcc508290f711a091cb712dd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Wed, 6 May 2009 13:12:29 -0700 Subject: [PATCH] osd: move .snap out of object_t This makes the snap versioning completely orthogonal to the logical object name (object_t). This is key since eventually object_t won't be structured. And the old way made for an awkward interface anyway. Also killed the .snap = 0 special casing, which AFAICS was useless. --- src/Makefile.am | 1 - src/TODO | 10 + src/client/Client.cc | 4 +- src/client/SyntheticClient.cc | 6 +- src/include/ceph_fs.h | 1 - src/include/object.h | 140 +++++--- src/include/pobject.h | 81 ----- src/include/rados.h | 7 +- src/include/types.h | 23 -- src/kernel/debugfs.c | 8 +- src/kernel/osd_client.c | 2 +- src/mds/CDir.cc | 2 +- src/mds/CInode.cc | 2 +- src/mds/MDSTable.cc | 2 +- src/mds/SessionMap.cc | 2 +- src/messages/MOSDOp.h | 3 + src/os/FileStore.cc | 46 +-- src/os/ObjectStore.h | 1 - src/osd/Ager.cc | 2 +- src/osd/OSD.cc | 24 +- src/osd/OSD.h | 4 +- src/osd/PG.cc | 79 +++-- src/osd/PG.h | 109 +++---- src/osd/ReplicatedPG.cc | 581 +++++++++++++++++----------------- src/osd/ReplicatedPG.h | 48 +-- src/osd/osd_types.h | 13 +- src/osdc/Filer.cc | 8 +- src/osdc/Filer.h | 21 +- src/osdc/Journaler.cc | 2 +- src/osdc/ObjectCacher.cc | 78 +++-- src/osdc/ObjectCacher.h | 57 ++-- src/osdc/Objecter.h | 35 +- src/streamtest.cc | 2 +- 33 files changed, 695 insertions(+), 709 deletions(-) delete mode 100644 src/include/pobject.h diff --git a/src/Makefile.am b/src/Makefile.am index d6a71f02bedcc..95148272389e8 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -350,7 +350,6 @@ noinst_HEADERS = \ include/nstring.h\ include/object.h\ include/page.h\ - include/pobject.h\ include/rangeset.h\ include/rados.h\ include/statlite.h\ diff --git a/src/TODO b/src/TODO index 14ef8cb771d01..bea8cf6de90bb 100644 --- a/src/TODO +++ b/src/TODO @@ -42,6 +42,16 @@ v0.9 - make mds exhert memory pressure on client caps, leases - optionally separate osd interfaces (ips) for clients and osds (replication, peering, etc.) +rados +- move snap out of object_t +- create/destroy pg_pools +- autosize pg_pools? +- security +- c library glue to c3 +- pyexec? +- + + later - client reconnect after long eviction; and slow delayed reconnect - ENOSPC diff --git a/src/client/Client.cc b/src/client/Client.cc index c6ddd0a9fd5a6..6361cba4eb6d4 100644 --- a/src/client/Client.cc +++ b/src/client/Client.cc @@ -2110,7 +2110,7 @@ void Client::handle_cap_trunc(Inode *in, MClientCaps *m) m->get_size() < in->inode.size) { // map range to objects vector ls; - filer->file_to_extents(in->inode.ino, &in->inode.layout, CEPH_NOSNAP, + filer->file_to_extents(in->inode.ino, &in->inode.layout, m->get_size(), in->inode.size - m->get_size(), ls); objectcacher->truncate_set(in->inode.ino, ls); @@ -5028,7 +5028,7 @@ int Client::enumerate_layout(int fd, vector& result, Inode *in = f->inode; // map to a list of extents - filer->file_to_extents(in->inode.ino, &in->inode.layout, CEPH_NOSNAP, offset, length, result); + filer->file_to_extents(in->inode.ino, &in->inode.layout, offset, length, result); dout(3) << "enumerate_layout(" << fd << ", " << length << ", " << offset << ") = 0" << dendl; return 0; diff --git a/src/client/SyntheticClient.cc b/src/client/SyntheticClient.cc index db10860889ff7..34b6bd62fa42f 100644 --- a/src/client/SyntheticClient.cc +++ b/src/client/SyntheticClient.cc @@ -1347,7 +1347,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) lock.Lock(); ceph_object_layout layout = client->osdmap->make_object_layout(oid, CEPH_CASDATA_RULE); __u64 size; - client->objecter->stat(oid, layout, &size, 0, new C_SafeCond(&lock, &cond, &ack)); + client->objecter->stat(oid, layout, CEPH_NOSNAP, &size, 0, new C_SafeCond(&lock, &cond, &ack)); while (!ack) cond.Wait(lock); lock.Unlock(); } @@ -1360,7 +1360,7 @@ int SyntheticClient::play_trace(Trace& t, string& prefix, bool metadata_only) lock.Lock(); ceph_object_layout layout = client->osdmap->make_object_layout(oid, CEPH_CASDATA_RULE); bufferlist bl; - client->objecter->read(oid, layout, off, len, &bl, 0, new C_SafeCond(&lock, &cond, &ack)); + client->objecter->read(oid, layout, off, len, CEPH_NOSNAP, &bl, 0, new C_SafeCond(&lock, &cond, &ack)); while (!ack) cond.Wait(lock); lock.Unlock(); } @@ -2275,7 +2275,7 @@ int SyntheticClient::object_rw(int nobj, int osize, int wrpc, } else { dout(10) << "read from " << oid << dendl; bufferlist inbl; - client->objecter->read(oid, layout, 0, osize, &inbl, 0, + client->objecter->read(oid, layout, 0, osize, CEPH_NOSNAP, &inbl, 0, new C_Ref(lock, cond, &unack)); } client->client_lock.Unlock(); diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index 1c890b8d1e366..b9e53760af0bc 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -907,5 +907,4 @@ struct ceph_mds_snap_realm { } __attribute__ ((packed)); /* followed by my snap list, then prior parent snap list */ - #endif diff --git a/src/include/object.h b/src/include/object.h index 9693a23c26d6f..2f8ac32ecb8ef 100644 --- a/src/include/object.h +++ b/src/include/object.h @@ -35,31 +35,30 @@ struct object_t { struct { uint64_t ino; // "file" identifier uint32_t bno; // "block" in that "file" - uint64_t snap; // snap revision. + uint64_t pad; } __attribute__ ((packed)); } __attribute__ ((packed)); - object_t() : ino(0), bno(0), snap(0) {} - object_t(uint64_t i, uint32_t b) : ino(i), bno(b), snap(0) {} - object_t(uint64_t i, uint32_t b, uint64_t r) : ino(i), bno(b), snap(r) {} + object_t() : ino(0), bno(0), pad(0) {} + object_t(uint64_t i, uint32_t b) : ino(i), bno(b), pad(0) {} // IMPORTANT: make this match struct ceph_object **** object_t(const ceph_object& co) { ino = co.ino; bno = co.bno; - snap = co.snap; + pad = co.pad; } operator ceph_object() { ceph_object oid; oid.ino = ino; oid.bno = bno; - oid.snap = snap; + oid.pad = pad; return oid; } void encode(bufferlist &bl) const { ::encode(ino, bl); ::encode(bno, bl); - ::encode(snap, bl); + ::encode(pad, bl); } void decode(bufferlist::iterator &bl) { __u64 i, r; @@ -69,7 +68,7 @@ struct object_t { ::decode(r, bl); ino = i; bno = b; - snap = r; + pad = r; } } __attribute__ ((packed)); WRITE_CLASS_ENCODER(object_t) @@ -99,12 +98,6 @@ inline ostream& operator<<(ostream& out, const object_t o) { out.fill('0'); out << setw(8) << o.bno; out.unsetf(ios::right); - if (o.snap) { - if (o.snap == CEPH_NOSNAP) - out << ".head"; - else - out << '.' << o.snap; - } out << dec; return out; } @@ -114,61 +107,136 @@ namespace __gnu_cxx { size_t operator()(const object_t &r) const { static rjhash H; static rjhash I; - return H(r.ino) ^ I(r.bno) ^ H(r.snap); + return H(r.ino) ^ I(r.bno) ^ H(r.pad); } }; } +// --------------------------- +// snaps + +struct snapid_t { + __u64 val; + snapid_t(__u64 v=0) : val(v) {} + snapid_t operator+=(snapid_t o) { val += o.val; return *this; } + snapid_t operator++() { ++val; return *this; } + operator __u64() const { return val; } +}; + +inline void encode(snapid_t i, bufferlist &bl) { encode(i.val, bl); } +inline void decode(snapid_t &i, bufferlist::iterator &p) { decode(i.val, p); } + +inline ostream& operator<<(ostream& out, snapid_t s) { + if (s == CEPH_NOSNAP) + return out << "head"; + else if (s == CEPH_SNAPDIR) + return out << "snapdir"; + else + return out << hex << s.val << dec; +} + + +struct sobject_t { + object_t oid; + snapid_t snap; + + sobject_t() : snap(0) {} + sobject_t(object_t o, snapid_t s) : oid(o), snap(s) {} + + void encode(bufferlist& bl) const { + ::encode(oid, bl); + ::encode(snap, bl); + } + void decode(bufferlist::iterator& bl) { + ::decode(oid, bl); + ::decode(snap, bl); + } +}; +WRITE_CLASS_ENCODER(sobject_t) + +inline bool operator==(const sobject_t l, const sobject_t r) { + return l.oid == r.oid && l.snap == r.snap; +} +inline bool operator!=(const sobject_t l, const sobject_t r) { + return l.oid != r.oid || l.snap != r.snap; +} +inline bool operator>(const sobject_t l, const sobject_t r) { + return l.oid > r.oid || (l.oid == r.oid && l.snap > r.snap); +} +inline bool operator<(const sobject_t l, const sobject_t r) { + return l.oid < r.oid || (l.oid == r.oid && l.snap < r.snap); +} +inline bool operator>=(const sobject_t l, const sobject_t r) { + return l.oid > r.oid || (l.oid == r.oid && l.snap >= r.snap); +} +inline bool operator<=(const sobject_t l, const sobject_t r) { + return l.oid < r.oid || (l.oid == r.oid && l.snap <= r.snap); +} +inline ostream& operator<<(ostream& out, const sobject_t o) { + return out << o.oid << "." << o.snap; +} + +namespace __gnu_cxx { + template<> struct hash { + size_t operator()(const sobject_t &r) const { + static hash H; + static rjhash I; + return H(r.oid) ^ I(r.snap); + } + }; +} + +// --------------------------- + +typedef sobject_t pobject_t; + struct coll_t { - __u64 high; - __u64 low; + __u64 pgid; + snapid_t snap; - coll_t(__u64 h=0, __u64 l=0) : high(h), low(l) {} + coll_t(__u64 p=0, snapid_t s=0) : pgid(p), snap(s) {} void encode(bufferlist& bl) const { - ::encode(high, bl); - ::encode(low, bl); + ::encode(pgid, bl); + ::encode(snap, bl); } void decode(bufferlist::iterator& bl) { - __u64 h, l; - ::decode(h, bl); - ::decode(l, bl); - high = h; - low = l; + ::decode(pgid, bl); + ::decode(snap, bl); } -} __attribute__ ((packed)); +}; WRITE_CLASS_ENCODER(coll_t) inline ostream& operator<<(ostream& out, const coll_t& c) { - return out << hex << c.high << '.' << c.low << dec; + return out << hex << c.pgid << '.' << c.snap << dec; } inline bool operator<(const coll_t& l, const coll_t& r) { - return l.high < r.high || (l.high == r.high && l.low < r.low); + return l.pgid < r.pgid || (l.pgid == r.pgid && l.snap < r.snap); } inline bool operator<=(const coll_t& l, const coll_t& r) { - return l.high < r.high || (l.high == r.high && l.low <= r.low); + return l.pgid < r.pgid || (l.pgid == r.pgid && l.snap <= r.snap); } inline bool operator==(const coll_t& l, const coll_t& r) { - return l.high == r.high && l.low == r.low; + return l.pgid == r.pgid && l.snap == r.snap; } inline bool operator!=(const coll_t& l, const coll_t& r) { - return l.high != r.high || l.low != r.low; + return l.pgid != r.pgid || l.snap != r.snap; } inline bool operator>(const coll_t& l, const coll_t& r) { - return l.high > r.high || (l.high == r.high && l.low > r.low); + return l.pgid > r.pgid || (l.pgid == r.pgid && l.snap > r.snap); } inline bool operator>=(const coll_t& l, const coll_t& r) { - return l.high > r.high || (l.high == r.high && l.low >= r.low); + return l.pgid > r.pgid || (l.pgid == r.pgid && l.snap >= r.snap); } - namespace __gnu_cxx { template<> struct hash { size_t operator()(const coll_t &c) const { - static rjhash H; - return H(c.high) ^ H(c.low); + static rjhash H; + static rjhash I; + return H(c.pgid) ^ I(c.snap); } }; } diff --git a/src/include/pobject.h b/src/include/pobject.h deleted file mode 100644 index d5fe7319cccc2..0000000000000 --- a/src/include/pobject.h +++ /dev/null @@ -1,81 +0,0 @@ -// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- -// vim: ts=8 sw=2 smarttab -/* - * Ceph - scalable distributed file system - * - * Copyright (C) 2004-2006 Sage Weil - * - * This is free software; you can redistribute it and/or - * modify it under the terms of the GNU Lesser General Public - * License version 2.1, as published by the Free Software - * Foundation. See file COPYING. - * - */ - -#ifndef __CEPH_POBJECT_H -#define __CEPH_POBJECT_H - -#include "object.h" - -/* - * "physical" object stored in an individual OSD's object store. - * includes fields to describe which volume the logical object_t - * belongs to, and/or a specific part of the object (if striped - * or encoded for redundancy, etc.). - */ -struct pobject_t { - uint32_t volume; // "volume" - uint32_t rank; // rank/stripe id (e.g. for parity encoding) - object_t oid; // logical object - pobject_t() : volume(0), rank(0) {} - pobject_t(uint16_t v, uint16_t r, object_t o) : volume(v), rank(r), oid(o) {} - void encode(bufferlist &bl) const { - ::encode(volume, bl); - ::encode(rank, bl); - ::encode(oid, bl); - } - void decode(bufferlist::iterator &bl) { - __u32 v, r; - ::decode(v, bl); - ::decode(r, bl); - volume = v; - rank = r; - oid.decode(bl); - } -} __attribute__ ((packed)); -WRITE_CLASS_ENCODER(pobject_t) - -inline ostream& operator<<(ostream& out, const pobject_t o) { - return out << o.volume << '/' << o.rank << '/' << o.oid; -} - -inline bool operator==(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) == 0; -} -inline bool operator!=(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) != 0; -} -inline bool operator>(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) > 0; -} -inline bool operator>=(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) >= 0; -} -inline bool operator<(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) < 0; -} -inline bool operator<=(const pobject_t l, const pobject_t r) { - return memcmp(&l, &r, sizeof(l)) <= 0; -} - -namespace __gnu_cxx { - template<> struct hash { - size_t operator()(const pobject_t &r) const { - static rjhash H; - static rjhash I; - return I(r.volume) ^ I(r.rank) ^ H(r.oid.ino) ^ I(r.oid.bno) ^ H(r.oid.snap); - } - }; -} - -#endif diff --git a/src/include/rados.h b/src/include/rados.h index 7718dcef77f11..c3c7a4e32507d 100644 --- a/src/include/rados.h +++ b/src/include/rados.h @@ -29,7 +29,7 @@ struct ceph_object { struct { __le64 ino; /* inode "file" identifier */ __le32 bno; /* "block" (object) in that "file" */ - __le64 snap; /* snapshot id. usually NOSNAP. */ + __le64 pad; } __attribute__ ((packed)); }; } __attribute__ ((packed)); @@ -338,7 +338,10 @@ struct ceph_osd_request_head { struct ceph_eversion reassert_version; /* writer's snap context */ - __le64 snap_seq; + union { + __le64 snap_seq; + __le64 snapid; + }; __le32 num_snaps; /* read or mutation */ diff --git a/src/include/types.h b/src/include/types.h index a5dcf464c8901..ddf894c0672cd 100644 --- a/src/include/types.h +++ b/src/include/types.h @@ -282,29 +282,6 @@ inline unsigned char MODE_TO_DT(int mode) { -// snaps -struct snapid_t { - __u64 val; - snapid_t(__u64 v=0) : val(v) {} - snapid_t operator+=(snapid_t o) { val += o.val; return *this; } - snapid_t operator++() { ++val; return *this; } - operator __u64() const { return val; } -}; - -inline void encode(snapid_t i, bufferlist &bl) { encode(i.val, bl); } -inline void decode(snapid_t &i, bufferlist::iterator &p) { decode(i.val, p); } - -inline ostream& operator<<(ostream& out, snapid_t s) { - if (s == CEPH_NOSNAP) - return out << "head"; - else if (s == CEPH_SNAPDIR) - return out << "snapdir"; - else - return out << hex << s.val << dec; -} - - - struct SnapRealmInfo { mutable ceph_mds_snap_realm h; vector my_snaps; diff --git a/src/kernel/debugfs.c b/src/kernel/debugfs.c index ae2a6a25ca65a..c571cfb4ad2fc 100644 --- a/src/kernel/debugfs.c +++ b/src/kernel/debugfs.c @@ -341,10 +341,10 @@ static int osdc_show(struct seq_file *s, void *p) head = req->r_request->front.iov_base; op = (void *)(head + 1); - seq_printf(s, "%llx.%08x\t%lld\t", - le64_to_cpu(head->oid.ino), - le32_to_cpu(head->oid.bno), - le64_to_cpu(head->oid.snap)); + seq_printf(s, "%llx.%08x.%llx\t", + le64_to_cpu(head->oid.ino), + le32_to_cpu(head->oid.bno), + le64_to_cpu(head->oid.pad)); if (req->r_reassert_version.epoch) seq_printf(s, "%u'%llu\t", diff --git a/src/kernel/osd_client.c b/src/kernel/osd_client.c index 916bc45b0c7fa..f87f9ccf756bd 100644 --- a/src/kernel/osd_client.c +++ b/src/kernel/osd_client.c @@ -35,7 +35,7 @@ static void calc_layout(struct ceph_osd_client *osdc, /* object extent? */ reqhead->oid.ino = cpu_to_le64(vino.ino); - reqhead->oid.snap = cpu_to_le64(vino.snap); + reqhead->snapid = cpu_to_le64(vino.snap); ceph_calc_file_object_mapping(layout, off, plen, &reqhead->oid, &objoff, &objlen); diff --git a/src/mds/CDir.cc b/src/mds/CDir.cc index a97db84dc858b..f8c1efb152719 100644 --- a/src/mds/CDir.cc +++ b/src/mds/CDir.cc @@ -1042,7 +1042,7 @@ void CDir::fetch(Context *c, bool ignore_authpinnability) OSDMap *osdmap = cache->mds->objecter->osdmap; ceph_object_layout ol = osdmap->make_object_layout(oid, cache->mds->mdsmap->get_metadata_pg_pool()); - cache->mds->objecter->read_full(oid, ol, &fin->bl, 0, fin); + cache->mds->objecter->read_full(oid, ol, CEPH_NOSNAP, &fin->bl, 0, fin); } void CDir::_fetched(bufferlist &bl) diff --git a/src/mds/CInode.cc b/src/mds/CInode.cc index 65f9fd435e2f3..6af6123ee3a5a 100644 --- a/src/mds/CInode.cc +++ b/src/mds/CInode.cc @@ -694,7 +694,7 @@ void CInode::fetch(Context *fin) mdcache->mds->mdsmap->get_metadata_pg_pool()); mdcache->mds->objecter->read(oid, ol, - rd, &c->bl, 0, c ); + rd, CEPH_NOSNAP, &c->bl, 0, c ); } void CInode::_fetched(bufferlist& bl, Context *fin) diff --git a/src/mds/MDSTable.cc b/src/mds/MDSTable.cc index ff80fcb38f58c..8e36e0ff4a8a8 100644 --- a/src/mds/MDSTable.cc +++ b/src/mds/MDSTable.cc @@ -125,7 +125,7 @@ void MDSTable::load(Context *onfinish) OSDMap *osdmap = mds->objecter->osdmap; ceph_object_layout ol = osdmap->make_object_layout(oid, mds->mdsmap->get_metadata_pg_pool()); - mds->objecter->read_full(oid, ol, &c->bl, 0, c); + mds->objecter->read_full(oid, ol, CEPH_NOSNAP, &c->bl, 0, c); } void MDSTable::load_2(int r, bufferlist& bl, Context *onfinish) diff --git a/src/mds/SessionMap.cc b/src/mds/SessionMap.cc index 13be59b354e19..f7d05e6aed20e 100644 --- a/src/mds/SessionMap.cc +++ b/src/mds/SessionMap.cc @@ -67,7 +67,7 @@ void SessionMap::load(Context *onload) OSDMap *osdmap = mds->objecter->osdmap; ceph_object_layout ol = osdmap->make_object_layout(oid, mds->mdsmap->get_metadata_pg_pool()); - mds->objecter->read_full(oid, ol, &c->bl, 0, c); + mds->objecter->read_full(oid, ol, CEPH_NOSNAP, &c->bl, 0, c); } void SessionMap::_load_finish(bufferlist &bl) diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h index a68754687078b..72d46d5a401b4 100644 --- a/src/messages/MOSDOp.h +++ b/src/messages/MOSDOp.h @@ -39,6 +39,9 @@ public: friend class MOSDOpReply; + // read + snapid_t get_snapid() { return snapid_t(head.snapid); } + // writ snapid_t get_snap_seq() { return snapid_t(head.snap_seq); } vector &get_snaps() { return snaps; } void set_snap_seq(snapid_t s) { head.snap_seq = s; } diff --git a/src/os/FileStore.cc b/src/os/FileStore.cc index e2193148f447b..7d91122e6ea09 100644 --- a/src/os/FileStore.cc +++ b/src/os/FileStore.cc @@ -169,16 +169,21 @@ int FileStore::statfs(struct statfs *buf) /* * sorry, these are sentitive to the pobject_t and coll_t typing. */ + + // 11111111112222222222333333333344444444445555555555 + // 012345678901234567890123456789012345678901234567890123456789 + // yyyyyyyyyyyyyyyy.zzzzzzzz.a_s + void FileStore::append_oname(const pobject_t &oid, char *s) { //assert(sizeof(oid) == 28); char *t = s + strlen(s); #ifdef __LP64__ - sprintf(t, "/%04x.%04x.%016lx.%08x.%lx", - oid.volume, oid.rank, oid.oid.ino, oid.oid.bno, oid.oid.snap); + sprintf(t, "/%016lx.%08x.%lx_%lx", + oid.oid.ino, oid.oid.bno, oid.oid.pad, (__u64)oid.snap); #else - sprintf(t, "/%04x.%04x.%016llx.%08x.%llx", - oid.volume, oid.rank, oid.oid.ino, oid.oid.bno, oid.oid.snap); + sprintf(t, "/%08x_%016llx.%08x.%llx_%llx", + oid.oid.ino, oid.oid.bno, oid.oid.pad, (__u64)oid.oid.snap); #endif //parse_object(t+1); } @@ -186,30 +191,27 @@ void FileStore::append_oname(const pobject_t &oid, char *s) bool FileStore::parse_object(char *s, pobject_t& o) { //assert(sizeof(o) == 28); - if (strlen(s) < 36 || - s[4] != '.' || - s[9] != '.' || - s[26] != '.' || - s[35] != '.') + if (strlen(s) < 29 || + s[16] != '.' || + s[25] != '.') return false; - o.volume = strtoull(s, 0, 16); - assert(s[4] == '.'); - o.rank = strtoull(s+5, 0, 16); - assert(s[9] == '.'); - o.oid.ino = strtoull(s+10, 0, 16); - assert(s[26] == '.'); - o.oid.bno = strtoull(s+27, 0, 16); - assert(s[35] == '.'); - o.oid.snap = strtoull(s+36, 0, 16); + o.oid.ino = strtoull(s, &s, 16); + o.oid.bno = strtoull(s+1, &s, 16); + o.oid.pad = strtoull(s+1, &s, 16); + o.snap = strtoull(s, &s, 16); return true; } + // 11111111112222222222333 + // 012345678901234567890123456789012 + // pppppppppppppppp.ssssssssssssssss + bool FileStore::parse_coll(char *s, coll_t& c) { if (strlen(s) == 33 && s[16] == '.') { s[16] = 0; - c.high = strtoull(s, 0, 16); - c.low = strtoull(s+17, 0, 16); + c.pgid = strtoull(s, 0, 16); + c.snap = strtoull(s+17, 0, 16); return true; } else return false; @@ -219,9 +221,9 @@ void FileStore::get_cdir(coll_t cid, char *s) { assert(sizeof(cid) == 16); #ifdef __LP64__ - sprintf(s, "%s/%016lx.%016lx", basedir.c_str(), cid.high, cid.low); + sprintf(s, "%s/%016lx.%016lx", basedir.c_str(), cid.pgid, (__u64)cid.snap); #else - sprintf(s, "%s/%016llx.%016llx", basedir.c_str(), cid.high, cid.low); + sprintf(s, "%s/%016llx.%016llx", basedir.c_str(), cid.pgid, (__u64)cid.snap); #endif } diff --git a/src/os/ObjectStore.h b/src/os/ObjectStore.h index ac077d449fda3..4f5a107bf285d 100644 --- a/src/os/ObjectStore.h +++ b/src/os/ObjectStore.h @@ -19,7 +19,6 @@ #include "include/types.h" #include "include/Context.h" #include "include/buffer.h" -#include "include/pobject.h" #include "include/nstring.h" #include "include/Distribution.h" diff --git a/src/osd/Ager.cc b/src/osd/Ager.cc index b5530488706e3..36f9b8b1fbe96 100644 --- a/src/osd/Ager.cc +++ b/src/osd/Ager.cc @@ -229,7 +229,7 @@ void Ager::age(int time, // init size distn (once) if (!did_distn) { did_distn = true; - age_cur_oid = pobject_t(0, 0, object_t(0,1)); + age_cur_oid = pobject_t(object_t(0,1), 0); file_size_distn.add(1, 19.0758125+0.65434375); file_size_distn.add(512, 35.6566); file_size_distn.add(1024, 27.7271875); diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index d0361907a4819..be08e024cbc8f 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -152,7 +152,7 @@ int OSD::mkfs(const char *dev, const char *jdev, ceph_fsid_t fsid, int whoami) utime_t start = g_clock.now(); for (int i=0; i<1000; i++) { ObjectStore::Transaction t; - t.write(0, pobject_t(0, 0, object_t(999,i)), 0, bl.length(), bl); + t.write(0, pobject_t(object_t(999,i), 0), 0, bl.length(), bl); store->apply_transaction(t); } store->sync(); @@ -161,7 +161,7 @@ int OSD::mkfs(const char *dev, const char *jdev, ceph_fsid_t fsid, int whoami) cout << "measured " << (1000.0 / (double)end) << " mb/sec" << std::endl; ObjectStore::Transaction tr; for (int i=0; i<1000; i++) - tr.remove(0, pobject_t(0, 0, object_t(999,i))); + tr.remove(0, pobject_t(object_t(999,i), 0)); store->apply_transaction(tr); // set osd weight @@ -703,9 +703,9 @@ void OSD::load_pgs() it++) { if (*it == 0) continue; - if (it->low != 0) + if (it->snap != 0) continue; - pg_t pgid = it->high; + pg_t pgid = it->pgid; PG *pg = _open_lock_pg(pgid); // read pg state, log @@ -3386,16 +3386,15 @@ void OSD::handle_op(MOSDOp *op) return; } - if (op->get_oid().snap > 0) { + if (op->get_snapid() > 0) { // snap read. hrm. // are we missing a revision that we might need? // let's get them all. for (unsigned i=0; iget_snaps().size(); i++) { - object_t oid = op->get_oid(); - oid.snap = op->get_snaps()[i]; - if (pg->is_missing_object(oid)) { - dout(10) << "handle_op _may_ need missing rev " << oid << ", pulling" << dendl; - pg->wait_for_missing_object(op->get_oid(), op); + sobject_t soid(op->get_oid(), op->get_snaps()[i]); + if (pg->is_missing_object(soid)) { + dout(10) << "handle_op _may_ need missing rev " << soid << ", pulling" << dendl; + pg->wait_for_missing_object(soid, op); pg->unlock(); return; } @@ -3448,8 +3447,9 @@ void OSD::handle_op(MOSDOp *op) } // missing object? - if (pg->is_missing_object(op->get_oid())) { - pg->wait_for_missing_object(op->get_oid(), op); + sobject_t head(op->get_oid(), CEPH_NOSNAP); + if (pg->is_missing_object(head)) { + pg->wait_for_missing_object(head, op); pg->unlock(); return; } diff --git a/src/osd/OSD.h b/src/osd/OSD.h index d314a2cbecede..8790f90cb73d7 100644 --- a/src/osd/OSD.h +++ b/src/osd/OSD.h @@ -120,10 +120,10 @@ public: int get_nodeid() { return whoami; } static pobject_t get_osdmap_pobject_name(epoch_t epoch) { - return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, epoch << 1)); + return pobject_t(object_t(0, epoch << 1), 0); } static pobject_t get_inc_osdmap_pobject_name(epoch_t epoch) { - return pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0, (epoch << 1) + 1)); + return pobject_t(object_t(0, (epoch << 1) + 1), 0); } diff --git a/src/osd/PG.cc b/src/osd/PG.cc index b84f770ca8f12..31214e6031b75 100644 --- a/src/osd/PG.cc +++ b/src/osd/PG.cc @@ -196,13 +196,13 @@ void PG::proc_replica_log(ObjectStore::Transaction& t, Info &oinfo, Log &olog, M eversion_t lu = oinfo.last_update; while (pp != olog.log.rend()) { Log::Entry& oe = *pp; - if (!log.objects.count(oe.oid)) { + if (!log.objects.count(oe.soid)) { dout(10) << " had " << oe << " new dne : divergent, ignoring" << dendl; ++pp; continue; } - Log::Entry& ne = *log.objects[oe.oid]; + Log::Entry& ne = *log.objects[oe.soid]; if (ne.version == oe.version) { dout(10) << " had " << oe << " new " << ne << " : match, stopping" << dendl; break; @@ -217,17 +217,17 @@ void PG::proc_replica_log(ObjectStore::Transaction& t, Info &oinfo, Log &olog, M } else { // old delete, new update. dout(20) << " had " << oe << " new " << ne << " : missing" << dendl; - omissing.add(ne.oid, ne.version, eversion_t()); + omissing.add(ne.soid, ne.version, eversion_t()); } } else { if (ne.is_delete()) { // old update, new delete dout(10) << " had " << oe << " new " << ne << " : new will supercede" << dendl; - omissing.rm(oe.oid, oe.version); + omissing.rm(oe.soid, oe.version); } else { // old update, new update dout(10) << " had " << oe << " new " << ne << " : new will supercede" << dendl; - omissing.revise_need(ne.oid, ne.version); + omissing.revise_need(ne.soid, ne.version); } } } @@ -266,12 +266,12 @@ void PG::proc_replica_log(ObjectStore::Transaction& t, Info &oinfo, Log &olog, M */ bool PG::merge_old_entry(ObjectStore::Transaction& t, Log::Entry& oe) { - if (log.objects.count(oe.oid)) { - Log::Entry &ne = *log.objects[oe.oid]; // new(er?) entry + if (log.objects.count(oe.soid)) { + Log::Entry &ne = *log.objects[oe.soid]; // new(er?) entry if (ne.version > oe.version) { dout(20) << "merge_old_entry had " << oe << " new " << ne << " : older, missing" << dendl; - assert(ne.is_delete() || missing.is_missing(ne.oid)); + assert(ne.is_delete() || missing.is_missing(ne.soid)); return false; } if (ne.version == oe.version) { @@ -286,17 +286,17 @@ bool PG::merge_old_entry(ObjectStore::Transaction& t, Log::Entry& oe) } else { // old delete, new update. dout(20) << "merge_old_entry had " << oe << " new " << ne << " : missing" << dendl; - assert(missing.is_missing(oe.oid)); + assert(missing.is_missing(oe.soid)); } } else { if (ne.is_delete()) { // old update, new delete dout(20) << "merge_old_entry had " << oe << " new " << ne << " : new delete supercedes" << dendl; - missing.rm(oe.oid, oe.version); + missing.rm(oe.soid, oe.version); } else { // old update, new update dout(20) << "merge_old_entry had " << oe << " new " << ne << " : new item supercedes" << dendl; - missing.revise_need(ne.oid, ne.version); + missing.revise_need(ne.soid, ne.version); } } } else { @@ -304,8 +304,8 @@ bool PG::merge_old_entry(ObjectStore::Transaction& t, Log::Entry& oe) dout(20) << "merge_old_entry had " << oe << " new dne : ok" << dendl; } else { dout(20) << "merge_old_entry had " << oe << " new dne : deleting" << dendl; - t.remove(info.pgid.to_coll(), pobject_t(info.pgid.pool(), 0, oe.oid)); - missing.rm(oe.oid, oe.version); + t.remove(info.pgid.to_coll(), oe.soid); + missing.rm(oe.soid, oe.version); } } return false; @@ -329,7 +329,7 @@ void PG::merge_log(ObjectStore::Transaction& t, } assert(log.backlog || log.top == eversion_t()); - hash_map old_objects; + hash_map old_objects; old_objects.swap(log.objects); // swap in other log and index @@ -362,7 +362,7 @@ void PG::merge_log(ObjectStore::Transaction& t, dout(10) << "merge_log merging " << ne << dendl; missing.add_next_event(ne); if (ne.is_delete()) - t.remove(info.pgid.to_coll(), pobject_t(info.pgid.pool(), 0, ne.oid)); + t.remove(info.pgid.to_coll(), ne.soid); } // find any divergent or removed items in old log. @@ -371,8 +371,8 @@ void PG::merge_log(ObjectStore::Transaction& t, p != olog.log.end(); p++) { Log::Entry &oe = *p; // old entry - if (old_objects.count(oe.oid) && - old_objects[oe.oid] == &oe) + if (old_objects.count(oe.soid) && + old_objects[oe.soid] == &oe) merge_old_entry(t, oe); } @@ -447,7 +447,7 @@ void PG::merge_log(ObjectStore::Transaction& t, log.index(ne); missing.add_next_event(ne); if (ne.is_delete()) - t.remove(info.pgid.to_coll(), pobject_t(info.pgid.pool(), 0, ne.oid)); + t.remove(info.pgid.to_coll(), ne.soid); } // move aside divergent items @@ -501,7 +501,7 @@ void PG::merge_log(ObjectStore::Transaction& t, void PG::search_for_missing(Log &olog, Missing &omissing, int fromosd) { // found items? - for (map::iterator p = missing.missing.begin(); + for (map::iterator p = missing.missing.begin(); p != missing.missing.end(); p++) { eversion_t need = p->second.need; @@ -541,10 +541,10 @@ bool PG::build_backlog_map(map& omap) for (vector::iterator it = olist.begin(); it != olist.end(); it++) { - pobject_t poid = pobject_t(info.pgid.pool(), 0, it->oid); + pobject_t poid = *it; Log::Entry e; - e.oid = it->oid; + e.soid = poid; bufferlist bv; osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv); object_info_t oi(bv); @@ -552,7 +552,7 @@ bool PG::build_backlog_map(map& omap) e.prior_version = oi.prior_version; e.reqid = oi.last_reqid; e.mtime = oi.mtime; - if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) { + if (e.soid.snap && e.soid.snap < CEPH_NOSNAP) { e.op = Log::Entry::CLONE; ::encode(oi.snaps, e.snaps); } else { @@ -606,8 +606,8 @@ void PG::assemble_backlog(map& omap) * - the prior_version is also already in the log * otherwise, we need to include it. */ - if (log.objects.count(be.oid)) { - Log::Entry *le = log.objects[be.oid]; + if (log.objects.count(be.soid)) { + Log::Entry *le = log.objects[be.soid]; assert(!le->is_delete()); // if it's a deletion, we are corrupt.. @@ -669,8 +669,8 @@ ostream& PG::IndexedLog::print(ostream& out) const for (list::const_iterator p = log.begin(); p != log.end(); p++) { - out << *p << " " << (logged_object(p->oid) ? "indexed":"NOT INDEXED") << std::endl; - assert(logged_object(p->oid)); + out << *p << " " << (logged_object(p->soid) ? "indexed":"NOT INDEXED") << std::endl; + assert(logged_object(p->soid)); assert(logged_req(p->reqid)); } return out; @@ -1319,7 +1319,7 @@ void PG::activate(ObjectStore::Transaction& t, while (log.complete_to->version < info.last_complete) log.complete_to++; assert(log.complete_to != log.log.end()); - log.last_requested = object_t(); + log.last_requested = sobject_t(); dout(10) << "activate - complete_to = " << log.complete_to->version << dendl; if (is_primary()) { dout(10) << "activate - starting recovery" << dendl; @@ -1781,28 +1781,27 @@ void PG::read_log(ObjectStore *store) dout(10) << "read_log checking for missing items over interval (" << info.last_complete << "," << info.last_update << "]" << dendl; - set did; + set did; for (list::reverse_iterator i = log.log.rbegin(); i != log.log.rend(); i++) { if (i->version <= info.last_complete) break; - if (did.count(i->oid)) continue; - did.insert(i->oid); + if (did.count(i->soid)) continue; + did.insert(i->soid); if (i->is_delete()) continue; - pobject_t poid(info.pgid.pool(), 0, i->oid); bufferlist bv; - int r = osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv); + int r = osd->store->getattr(info.pgid.to_coll(), i->soid, OI_ATTR, bv); if (r >= 0) { object_info_t oi(bv); if (oi.version < i->version) { dout(15) << "read_log missing " << *i << " (have " << oi.version << ")" << dendl; - missing.add(i->oid, i->version, oi.version); + missing.add(i->soid, i->version, oi.version); } } else { dout(15) << "read_log missing " << *i << dendl; - missing.add(i->oid, i->version, eversion_t()); + missing.add(i->soid, i->version, eversion_t()); } } } @@ -1864,13 +1863,13 @@ coll_t PG::make_snap_collection(ObjectStore::Transaction& t, snapid_t s) // bool PG::block_if_wrlocked(MOSDOp* op, object_info_t& oi) { - pobject_t poid(info.pgid.pool(), 0, op->get_oid()); + sobject_t soid(op->get_oid(), CEPH_NOSNAP); if (oi.wrlock_by.tid && oi.wrlock_by.name != op->get_orig_source()) { //the object is locked for writing by someone else -- add the op to the waiting queue dout(10) << "blocked on wrlock on " << oi << dendl; - waiting_for_wr_unlock[poid.oid].push_back(op); + waiting_for_wr_unlock[soid].push_back(op); return true; } @@ -2000,11 +1999,11 @@ void PG::repair_object(ScrubMap::object *po, int bad_peer, int ok_peer) bv.push_back(po->attrs["oi"]); object_info_t oi(bv); if (bad_peer != acting[0]) { - peer_missing[bad_peer].add(po->poid.oid, oi.version, eversion_t()); + peer_missing[bad_peer].add(po->poid, oi.version, eversion_t()); } else { - missing.add(po->poid.oid, oi.version, eversion_t()); - missing_loc[po->poid.oid].insert(ok_peer); - log.last_requested = object_t(); + missing.add(po->poid, oi.version, eversion_t()); + missing_loc[po->poid].insert(ok_peer); + log.last_requested = sobject_t(); } uptodate_set.erase(bad_peer); osd->queue_for_recovery(this); diff --git a/src/osd/PG.h b/src/osd/PG.h index 73d46a5caf465..10009c7577d0a 100644 --- a/src/osd/PG.h +++ b/src/osd/PG.h @@ -191,17 +191,18 @@ public: const static int BACKLOG = 4; // event invented by generate_backlog __s32 op; // write, zero, trunc, remove - object_t oid; + sobject_t soid; + snapid_t snap; eversion_t version, prior_version; osd_reqid_t reqid; // caller+tid to uniquely identify request utime_t mtime; // this is the _user_ mtime, mind you bufferlist snaps; // only for clone entries Entry() : op(0) {} - Entry(int _op, object_t _oid, + Entry(int _op, sobject_t _soid, const eversion_t& v, const eversion_t& pv, const osd_reqid_t& rid, const utime_t& mt) : - op(_op), oid(_oid), version(v), + op(_op), soid(_soid), version(v), prior_version(pv), reqid(rid), mtime(mt) {} @@ -213,7 +214,7 @@ public: void encode(bufferlist &bl) const { ::encode(op, bl); - ::encode(oid, bl); + ::encode(soid, bl); ::encode(version, bl); ::encode(prior_version, bl); ::encode(reqid, bl); @@ -223,7 +224,7 @@ public: } void decode(bufferlist::iterator &bl) { ::decode(op, bl); - ::decode(oid, bl); + ::decode(soid, bl); ::decode(version, bl); ::decode(prior_version, bl); ::decode(reqid, bl); @@ -289,12 +290,12 @@ public: * plus some methods to manipulate it all. */ struct IndexedLog : public Log { - hash_map objects; // ptrs into log. be careful! - hash_set caller_ops; + hash_map objects; // ptrs into log. be careful! + hash_set caller_ops; // recovery pointers list::iterator complete_to; // not inclusive of referenced item - object_t last_requested; // last object requested by primary + sobject_t last_requested; // last object requested by primary /****/ IndexedLog() {} @@ -307,10 +308,10 @@ public: } void reset_recovery_pointers() { complete_to = log.end(); - last_requested = object_t(); + last_requested = sobject_t(); } - bool logged_object(object_t oid) const { + bool logged_object(sobject_t oid) const { return objects.count(oid); } bool logged_req(const osd_reqid_t &r) const { @@ -323,15 +324,15 @@ public: for (list::iterator i = log.begin(); i != log.end(); i++) { - objects[i->oid] = &(*i); + objects[i->soid] = &(*i); caller_ops.insert(i->reqid); } } void index(Entry& e) { - if (objects.count(e.oid) == 0 || - objects[e.oid]->version < e.version) - objects[e.oid] = &e; + if (objects.count(e.soid) == 0 || + objects[e.soid]->version < e.version) + objects[e.soid] = &e; caller_ops.insert(e.reqid); } void unindex() { @@ -340,19 +341,19 @@ public: } void unindex(Entry& e) { // NOTE: this only works if we remove from the _bottom_ of the log! - assert(objects.count(e.oid)); - if (objects[e.oid]->version == e.version) - objects.erase(e.oid); + assert(objects.count(e.soid)); + if (objects[e.soid]->version == e.version) + objects.erase(e.soid); caller_ops.erase(e.reqid); } // accessors - Entry *is_updated(object_t oid) { + Entry *is_updated(sobject_t oid) { if (objects.count(oid) && objects[oid]->is_update()) return objects[oid]; return 0; } - Entry *is_deleted(object_t oid) { + Entry *is_deleted(sobject_t oid) { if (objects.count(oid) && objects[oid]->is_delete()) return objects[oid]; return 0; } @@ -366,7 +367,7 @@ public: top = e.version; // to our index - objects[e.oid] = &(log.back()); + objects[e.soid] = &(log.back()); caller_ops.insert(e.reqid); } @@ -426,8 +427,8 @@ public: }; WRITE_CLASS_ENCODER(item) - map missing; // oid -> (need v, have v) - map rmissing; // v -> oid + map missing; // oid -> (need v, have v) + map rmissing; // v -> oid unsigned num_missing() const { return missing.size(); } @@ -436,13 +437,13 @@ public: rmissing.swap(o.rmissing); } - bool is_missing(object_t oid) { + bool is_missing(sobject_t oid) { return missing.count(oid); } - bool is_missing(object_t oid, eversion_t v) { + bool is_missing(sobject_t oid, eversion_t v) { return missing.count(oid) && missing[oid].need <= v; } - eversion_t have_old(object_t oid) { + eversion_t have_old(sobject_t oid) { return missing.count(oid) ? missing[oid].have : eversion_t(); } @@ -454,41 +455,41 @@ public: if (e.is_update()) { if (e.prior_version == eversion_t()) { // new object. - //assert(missing.count(e.oid) == 0); // might already be missing divergent item. - if (missing.count(e.oid)) // already missing divergent item - rmissing.erase(missing[e.oid].need); - missing[e.oid] = item(e.version, eversion_t()); // .have = nil - } else if (missing.count(e.oid)) { + //assert(missing.count(e.soid) == 0); // might already be missing divergent item. + if (missing.count(e.soid)) // already missing divergent item + rmissing.erase(missing[e.soid].need); + missing[e.soid] = item(e.version, eversion_t()); // .have = nil + } else if (missing.count(e.soid)) { // already missing (prior). - assert(missing[e.oid].need == e.prior_version); + assert(missing[e.soid].need == e.prior_version); rmissing.erase(e.prior_version); - missing[e.oid].need = e.version; // .have unchanged. + missing[e.soid].need = e.version; // .have unchanged. } else { // not missing, we must have prior_version (if any) - missing[e.oid] = item(e.version, e.prior_version); + missing[e.soid] = item(e.version, e.prior_version); } - rmissing[e.version] = e.oid; + rmissing[e.version] = e.soid; } else - rm(e.oid, e.version); + rm(e.soid, e.version); } void add_event(Log::Entry& e) { if (e.is_update()) { - if (missing.count(e.oid)) { - if (missing[e.oid].need >= e.version) + if (missing.count(e.soid)) { + if (missing[e.soid].need >= e.version) return; // already missing same or newer. // missing older, revise need - rmissing.erase(missing[e.oid].need); - missing[e.oid].need = e.version; + rmissing.erase(missing[e.soid].need); + missing[e.soid].need = e.version; } else // not missing => have prior_version (if any) - missing[e.oid] = item(e.version, e.prior_version); - rmissing[e.version] = e.oid; + missing[e.soid] = item(e.version, e.prior_version); + rmissing[e.version] = e.soid; } else - rm(e.oid, e.version); + rm(e.soid, e.version); } - void revise_need(object_t oid, eversion_t need) { + void revise_need(sobject_t oid, eversion_t need) { if (missing.count(oid)) { rmissing.erase(missing[oid].need); missing[oid].need = need; // no not adjust .have @@ -498,18 +499,18 @@ public: rmissing[need] = oid; } - void add(object_t oid, eversion_t need, eversion_t have) { + void add(sobject_t oid, eversion_t need, eversion_t have) { missing[oid] = item(need, have); rmissing[need] = oid; } - void rm(object_t oid, eversion_t when) { + void rm(sobject_t oid, eversion_t when) { if (missing.count(oid) && missing[oid].need < when) { rmissing.erase(missing[oid].need); missing.erase(oid); } } - void got(object_t oid, eversion_t v) { + void got(sobject_t oid, eversion_t v) { assert(missing.count(oid)); assert(missing[oid].need <= v); rmissing.erase(missing[oid].need); @@ -522,7 +523,7 @@ public: void decode(bufferlist::iterator &bl) { ::decode(missing, bl); - for (map::iterator it = missing.begin(); + for (map::iterator it = missing.begin(); it != missing.end(); it++) rmissing[it->second.need] = it->first; @@ -611,7 +612,7 @@ public: IndexedLog log; OndiskLog ondisklog; Missing missing; - map > missing_loc; + map > missing_loc; set snap_collections; map past_intervals; @@ -653,17 +654,17 @@ protected: // pg waiters list waiting_for_active; - hash_map > waiting_for_missing_object; map replay_queue; - hash_map > waiting_for_wr_unlock; + hash_map > waiting_for_wr_unlock; bool block_if_wrlocked(MOSDOp* op, object_info_t& oi); // stats - hash_map stat_object_temp_rd; + hash_map stat_object_temp_rd; Mutex pg_stats_lock; bool pg_stats_valid; @@ -840,8 +841,8 @@ public: virtual bool same_for_rep_modify_since(epoch_t e) = 0; virtual bool is_write_in_progress() = 0; - virtual bool is_missing_object(object_t oid) = 0; - virtual void wait_for_missing_object(object_t oid, Message *op) = 0; + virtual bool is_missing_object(sobject_t oid) = 0; + virtual void wait_for_missing_object(sobject_t oid, Message *op) = 0; virtual void on_osd_failure(int osd) = 0; virtual void on_role_change() = 0; @@ -892,7 +893,7 @@ inline ostream& operator<<(ostream& out, const PG::Log::Entry& e) (e.is_modify() ? " m ": (e.is_backlog() ? " b ": " ? ")))) - << e.oid << " by " << e.reqid << " " << e.mtime; + << e.soid << " by " << e.reqid << " " << e.mtime; } inline ostream& operator<<(ostream& out, const PG::Log& log) diff --git a/src/osd/ReplicatedPG.cc b/src/osd/ReplicatedPG.cc index 158d1aa0960c0..c57a9c6b7e1cf 100644 --- a/src/osd/ReplicatedPG.cc +++ b/src/osd/ReplicatedPG.cc @@ -70,36 +70,34 @@ bool ReplicatedPG::same_for_rep_modify_since(epoch_t e) // ==================== // missing objects -bool ReplicatedPG::is_missing_object(object_t oid) +bool ReplicatedPG::is_missing_object(sobject_t soid) { - return missing.missing.count(oid); + return missing.missing.count(soid); } -void ReplicatedPG::wait_for_missing_object(object_t oid, Message *m) +void ReplicatedPG::wait_for_missing_object(sobject_t soid, Message *m) { - assert(is_missing_object(oid)); - - pobject_t poid(info.pgid.pool(), 0, oid); + assert(is_missing_object(soid)); // we don't have it (yet). - eversion_t v = missing.missing[oid].need; - if (pulling.count(oid)) { + eversion_t v = missing.missing[soid].need; + if (pulling.count(soid)) { dout(7) << "missing " - << poid + << soid << " v " << v << ", already pulling" << dendl; } else { dout(7) << "missing " - << poid + << soid << " v " << v << ", pulling" << dendl; - pull(poid); + pull(soid); osd->start_recovery_op(this, 1); } - waiting_for_missing_object[oid].push_back(m); + waiting_for_missing_object[soid].push_back(m); } @@ -118,7 +116,7 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) ceph_osd_op& readop = op->ops[0]; object_t oid = op->get_oid(); - pobject_t poid(info.pgid.pool(), 0, oid); + sobject_t soid(oid, op->get_snapid()); // -- load balance reads -- if (is_primary()) { @@ -135,6 +133,7 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) } } +#if 0 // -- balance reads? if (g_conf.osd_balance_reads && !op->get_source().is_osd()) { @@ -147,8 +146,8 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) // hot? double temp = 0; - if (stat_object_temp_rd.count(oid)) - temp = stat_object_temp_rd[oid].get(op->get_recv_stamp()); + if (stat_object_temp_rd.count(soid)) + temp = stat_object_temp_rd[soid].get(op->get_recv_stamp()); bool is_hotly_read = temp > g_conf.osd_balance_reads_temp; dout(20) << "balance_reads oid " << oid << " temp " << temp @@ -161,13 +160,13 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) bool b; // *** FIXME *** this may block, and we're in the fast path! *** if (g_conf.osd_balance_reads && - osd->store->getattr(info.pgid.to_coll(), poid, "balance-reads", &b, 1) >= 0) + osd->store->getattr(info.pgid.to_coll(), soid, "balance-reads", &b, 1) >= 0) is_balanced = true; if (!is_balanced && should_balance && - balancing_reads.count(oid) == 0) { + balancing_reads.count(soid) == 0) { dout(-10) << "preprocess_op balance-reads on " << oid << dendl; - balancing_reads.insert(oid); + balancing_reads.insert(soid); ceph_object_layout layout; layout.ol_pgid = info.pgid.u.pg64; layout.ol_stripe_unit = 0; @@ -180,9 +179,9 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) do_op(pop); } if (is_balanced && !should_balance && - !unbalancing_reads.count(oid) == 0) { + !unbalancing_reads.count(soid) == 0) { dout(-10) << "preprocess_op unbalance-reads on " << oid << dendl; - unbalancing_reads.insert(oid); + unbalancing_reads.insert(soid); ceph_object_layout layout; layout.ol_pgid = info.pgid.u.pg64; layout.ol_stripe_unit = 0; @@ -195,6 +194,7 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) do_op(pop); } } +#endif // -- read shedding if (g_conf.osd_shed_reads && @@ -340,13 +340,13 @@ bool ReplicatedPG::preprocess_op(MOSDOp *op, utime_t now) // -- fastpath read? // if this is a read and the data is in the cache, do an immediate read.. if ( g_conf.osd_immediate_read_from_cache ) { - if (osd->store->is_cached(info.pgid.to_coll(), poid, + if (osd->store->is_cached(info.pgid.to_coll(), soid, readop.offset, readop.length) == 0) { if (!is_primary() && !op->get_source().is_osd()) { // am i allowed? bool v; - if (osd->store->getattr(info.pgid.to_coll(), poid, "balance-reads", &v, 1) < 0) { + if (osd->store->getattr(info.pgid.to_coll(), soid, "balance-reads", &v, 1) < 0) { dout(-10) << "preprocess_op in-cache but no balance-reads on " << oid << ", fwd to primary" << dendl; osd->messenger->forward_message(op, osd->osdmap->get_inst(get_primary())); @@ -443,7 +443,7 @@ bool ReplicatedPG::snap_trimmer() dout(10) << "snap_trimmer collection " << c << " has " << ls.size() << " items" << dendl; for (vector::iterator p = ls.begin(); p != ls.end(); p++) { - pobject_t coid = *p; + sobject_t coid = *p; ObjectStore::Transaction t; @@ -453,8 +453,8 @@ bool ReplicatedPG::snap_trimmer() object_info_t coi(bl); // load head info - pobject_t head = coid; - head.oid.snap = CEPH_NOSNAP; + sobject_t head = coid; + head.snap = CEPH_NOSNAP; bl.clear(); osd->store->getattr(info.pgid.to_coll(), head, OI_ATTR, bl); object_info_t hoi(bl); @@ -483,7 +483,7 @@ bool ReplicatedPG::snap_trimmer() t.collection_remove(info.pgid.to_snap_coll(snaps[snaps.size()-1]), coid); // ...from snapset - snapid_t last = coid.oid.snap; + snapid_t last = coid.snap; vector::iterator p; for (p = snapset.clones.begin(); p != snapset.clones.end(); p++) if (*p == last) @@ -573,10 +573,10 @@ bool ReplicatedPG::snap_trimmer() /* * return false if object doesn't (logically) exist */ -int ReplicatedPG::pick_read_snap(pobject_t& poid, object_info_t& coi) +int ReplicatedPG::pick_read_snap(sobject_t& soid, object_info_t& coi) { - pobject_t head = poid; - head.oid.snap = CEPH_NOSNAP; + sobject_t head = soid; + head.snap = CEPH_NOSNAP; bufferlist bl; int r = osd->store->getattr(info.pgid.to_coll(), head, OI_ATTR, bl); @@ -584,16 +584,18 @@ int ReplicatedPG::pick_read_snap(pobject_t& poid, object_info_t& coi) return -ENOENT; // if head doesn't exist, no snapped version will either. object_info_t hoi(bl); - dout(10) << "pick_read_snap " << poid << " snapset " << hoi.snapset << dendl; - snapid_t want = poid.oid.snap; + dout(10) << "pick_read_snap " << soid << " snapset " << hoi.snapset << dendl; + + snapid_t want = soid.snap; // head? - if (want > hoi.snapset.seq) { + if (soid.snap > hoi.snapset.seq) { if (hoi.snapset.head_exists) { dout(10) << "pick_read_snap " << head << " want " << want << " > snapset seq " << hoi.snapset.seq << " -- HIT" << dendl; - poid = head; + soid = head; + coi = hoi; return 0; } else { dout(10) << "pick_read_snap " << head @@ -613,22 +615,26 @@ int ReplicatedPG::pick_read_snap(pobject_t& poid, object_info_t& coi) } // check clone - poid.oid.snap = hoi.snapset.clones[k]; - - if (missing.is_missing(poid.oid)) { - dout(20) << "pick_read_snap " << poid << " missing, try again later" << dendl; + soid.snap = hoi.snapset.clones[k]; + if (missing.is_missing(soid)) { + dout(20) << "pick_read_snap " << soid << " missing, try again later" << dendl; return -EAGAIN; } + // load clone info + bl.clear(); + osd->store->getattr(info.pgid.to_coll(), soid, OI_ATTR, bl); + coi.decode(bl); + // clone - dout(20) << "pick_read_snap " << poid << " snaps " << coi.snaps << dendl; + dout(20) << "pick_read_snap " << soid << " snaps " << coi.snaps << dendl; snapid_t first = coi.snaps[coi.snaps.size()-1]; snapid_t last = coi.snaps[0]; if (first <= want) { - dout(20) << "pick_read_snap " << poid << " [" << first << "," << last << "] contains " << want << " -- HIT" << dendl; + dout(20) << "pick_read_snap " << soid << " [" << first << "," << last << "] contains " << want << " -- HIT" << dendl; return 0; } else { - dout(20) << "pick_read_snap " << poid << " [" << first << "," << last << "] does not contain " << want << " -- DNE" << dendl; + dout(20) << "pick_read_snap " << soid << " [" << first << "," << last << "] does not contain " << want << " -- DNE" << dendl; return -ENOENT; } } @@ -637,28 +643,33 @@ int ReplicatedPG::pick_read_snap(pobject_t& poid, object_info_t& coi) void ReplicatedPG::op_read(MOSDOp *op) { object_t oid = op->get_oid(); - pobject_t poid(info.pgid.pool(), 0, oid); + sobject_t soid(oid, op->get_snapid()); - dout(10) << "op_read " << oid << " " << op->ops << dendl; - - object_info_t oi(poid); + dout(10) << "op_read " << soid << " " << op->ops << dendl; - bufferlist bv; - osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv); - if (bv.length()) - oi.decode(bv); + bufferlist::iterator bp = op->get_data().begin(); + bufferlist data; + int data_off = 0; + int result = 0; + + // pick revision + object_info_t oi(soid); + if (soid.snap) { + result = pick_read_snap(soid, oi); + if (result == -EAGAIN) { + wait_for_missing_object(soid, op); + return; + } + if (result != 0) + goto done; // we have no revision for this request. + } // wrlocked? - if (poid.oid.snap == CEPH_NOSNAP && + if (op->get_snapid() == CEPH_NOSNAP && block_if_wrlocked(op, oi)) return; - bufferlist::iterator bp = op->get_data().begin(); - bufferlist data; - int data_off = 0; - int result = 0; - // !primary and unbalanced? // (ignore ops forwarded from the primary) if (!is_primary()) { @@ -695,9 +706,9 @@ void ReplicatedPG::op_read(MOSDOp *op) } else { // make sure i exist and am balanced, otherwise fw back to acker. bool b; - if (!osd->store->exists(info.pgid.to_coll(), poid) || - osd->store->getattr(info.pgid.to_coll(), poid, "balance-reads", &b, 1) < 0) { - dout(-10) << "read on replica, object " << poid + if (!osd->store->exists(info.pgid.to_coll(), soid) || + osd->store->getattr(info.pgid.to_coll(), soid, "balance-reads", &b, 1) < 0) { + dout(-10) << "read on replica, object " << soid << " dne or no balance-reads, fw back to primary" << dendl; osd->messenger->forward_message(op, osd->osdmap->get_inst(get_primary())); return; @@ -706,23 +717,13 @@ void ReplicatedPG::op_read(MOSDOp *op) } // do it. - if (poid.oid.snap) { - result = pick_read_snap(poid, oi); - if (result == -EAGAIN) { - wait_for_missing_object(poid.oid, op); - return; - } - if (result != 0) - goto done; // we have no revision for this request. - } - for (vector::iterator p = op->ops.begin(); p != op->ops.end(); p++) { switch (p->op) { case CEPH_OSD_OP_READ: { // read into a buffer bufferlist bl; - int r = osd->store->read(info.pgid.to_coll(), poid, p->offset, p->length, bl); + int r = osd->store->read(info.pgid.to_coll(), soid, p->offset, p->length, bl); if (data.length() == 0) data_off = p->offset; data.claim(bl); @@ -742,7 +743,7 @@ void ReplicatedPG::op_read(MOSDOp *op) { struct stat st; memset(&st, sizeof(st), 0); - int r = osd->store->stat(info.pgid.to_coll(), poid, &st); + int r = osd->store->stat(info.pgid.to_coll(), soid, &st); if (r >= 0) p->length = st.st_size; else @@ -755,7 +756,7 @@ void ReplicatedPG::op_read(MOSDOp *op) nstring name(p->name_len + 1); name[0] = '_'; bp.copy(p->name_len, name.data()+1); - int r = osd->store->getattr(info.pgid.to_coll(), poid, name.c_str(), data); + int r = osd->store->getattr(info.pgid.to_coll(), soid, name.c_str(), data); if (r >= 0) { p->value_len = r; result = 0; @@ -862,7 +863,7 @@ void ReplicatedPG::op_read(MOSDOp *op) if (is_primary() && g_conf.osd_balance_reads) - stat_object_temp_rd[oid].hit(now); // hit temp. + stat_object_temp_rd[soid].hit(now); // hit temp. } osd->messenger->send_message(reply, op->get_orig_source_inst()); @@ -881,7 +882,7 @@ void ReplicatedPG::op_read(MOSDOp *op) void ReplicatedPG::_make_clone(ObjectStore::Transaction& t, - pobject_t head, pobject_t coid, + sobject_t head, pobject_t coid, eversion_t ov, eversion_t v, osd_reqid_t& reqid, utime_t mtime, vector& snaps) { object_info_t pi(coid); @@ -898,11 +899,11 @@ void ReplicatedPG::_make_clone(ObjectStore::Transaction& t, } void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, osd_reqid_t reqid, pg_stat_t& stats, - pobject_t poid, loff_t old_size, object_info_t& oi, + sobject_t soid, loff_t old_size, object_info_t& oi, eversion_t& at_version, SnapContext& snapc) { // clone? - assert(poid.oid.snap == CEPH_NOSNAP); + assert(soid.snap == CEPH_NOSNAP); dout(20) << "snapset=" << oi.snapset << " snapc=" << snapc << dendl;; // use newer snapc? @@ -916,8 +917,8 @@ void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, snapc.snaps.size() && // there are snaps snapc.snaps[0] > oi.snapset.seq) { // existing object is old // clone - pobject_t coid = poid; - coid.oid.snap = snapc.seq; + sobject_t coid = soid; + coid.snap = snapc.seq; unsigned l; for (l=1; l oi.snapset.seq; l++) ; @@ -927,7 +928,7 @@ void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, snaps[i] = snapc.snaps[i]; // prepare clone - _make_clone(t, poid, coid, oi.version, at_version, reqid, oi.mtime, snaps); + _make_clone(t, soid, coid, oi.version, at_version, reqid, oi.mtime, snaps); // add to snap bound collections coll_t fc = make_snap_collection(t, snaps[0]); @@ -939,15 +940,15 @@ void ReplicatedPG::prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, stats.num_objects++; stats.num_object_clones++; - oi.snapset.clones.push_back(coid.oid.snap); - oi.snapset.clone_size[coid.oid.snap] = old_size; - oi.snapset.clone_overlap[coid.oid.snap].insert(0, old_size); + oi.snapset.clones.push_back(coid.snap); + oi.snapset.clone_size[coid.snap] = old_size; + oi.snapset.clone_overlap[coid.snap].insert(0, old_size); // log clone dout(10) << "cloning v " << oi.version << " to " << coid << " v " << at_version << " snaps=" << snaps << dendl; - Log::Entry cloneentry(PG::Log::Entry::CLONE, coid.oid, at_version, oi.version, reqid, oi.mtime); + Log::Entry cloneentry(PG::Log::Entry::CLONE, coid, at_version, oi.version, reqid, oi.mtime); ::encode(snaps, cloneentry.snaps); add_log_entry(cloneentry, logbl); @@ -970,7 +971,7 @@ void ReplicatedPG::add_interval_usage(interval_set<__u64>& s, pg_stat_t& stats) // low level object operations int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t reqid, pg_stat_t& st, - pobject_t poid, __u64& old_size, bool& exists, object_info_t& oi, + sobject_t soid, __u64& old_size, bool& exists, object_info_t& oi, vector& ops, int opn, bufferlist::iterator& bp, SnapContext& snapc) { @@ -1019,12 +1020,12 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req case CEPH_OSD_OP_BALANCEREADS: { bool bal = true; - t.setattr(info.pgid.to_coll(), poid, "balance-reads", &bal, sizeof(bal)); + t.setattr(info.pgid.to_coll(), soid, "balance-reads", &bal, sizeof(bal)); } break; case CEPH_OSD_OP_UNBALANCEREADS: { - t.rmattr(info.pgid.to_coll(), poid, "balance-reads"); + t.rmattr(info.pgid.to_coll(), soid, "balance-reads"); } break; @@ -1036,7 +1037,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req assert(op.length); bufferlist nbl; bp.copy(op.length, nbl); - t.write(info.pgid.to_coll(), poid, op.offset, op.length, nbl); + t.write(info.pgid.to_coll(), soid, op.offset, op.length, nbl); if (oi.snapset.clones.size()) { snapid_t newest = *oi.snapset.clones.rbegin(); interval_set<__u64> ch; @@ -1059,8 +1060,8 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req { // write full object bufferlist nbl; bp.copy(op.length, nbl); - t.truncate(info.pgid.to_coll(), poid, 0); - t.write(info.pgid.to_coll(), poid, op.offset, op.length, nbl); + t.truncate(info.pgid.to_coll(), soid, 0); + t.write(info.pgid.to_coll(), soid, op.offset, op.length, nbl); if (oi.snapset.clones.size()) { snapid_t newest = *oi.snapset.clones.rbegin(); oi.snapset.clone_overlap.erase(newest); @@ -1081,8 +1082,8 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req { // zero assert(op.length); if (!exists) - t.touch(info.pgid.to_coll(), poid); - t.zero(info.pgid.to_coll(), poid, op.offset, op.length); + t.touch(info.pgid.to_coll(), soid); + t.zero(info.pgid.to_coll(), soid, op.offset, op.length); if (oi.snapset.clones.size()) { snapid_t newest = *oi.snapset.clones.rbegin(); interval_set<__u64> ch; @@ -1098,8 +1099,8 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req case CEPH_OSD_OP_TRUNCATE: { // truncate if (!exists) - t.touch(info.pgid.to_coll(), poid); - t.truncate(info.pgid.to_coll(), poid, op.offset); + t.touch(info.pgid.to_coll(), soid); + t.truncate(info.pgid.to_coll(), soid, op.offset); if (oi.snapset.clones.size()) { snapid_t newest = *oi.snapset.clones.rbegin(); interval_set<__u64> trim; @@ -1126,7 +1127,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req case CEPH_OSD_OP_DELETE: { // delete - t.remove(info.pgid.to_coll(), poid); + t.remove(info.pgid.to_coll(), soid); if (oi.snapset.clones.size()) { snapid_t newest = *oi.snapset.clones.rbegin(); add_interval_usage(oi.snapset.clone_overlap[newest], st); @@ -1149,15 +1150,15 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req case CEPH_OSD_OP_SETXATTR: { if (!exists) - t.touch(info.pgid.to_coll(), poid); + t.touch(info.pgid.to_coll(), soid); nstring name(op.name_len + 1); name[0] = '_'; bp.copy(op.name_len, name.data()+1); bufferlist bl; bp.copy(op.value_len, bl); if (!oi.snapset.head_exists) // create object if it doesn't yet exist. - t.touch(info.pgid.to_coll(), poid); - t.setattr(info.pgid.to_coll(), poid, name, bl); + t.touch(info.pgid.to_coll(), soid); + t.setattr(info.pgid.to_coll(), soid, name, bl); oi.snapset.head_exists = true; } break; @@ -1167,7 +1168,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req nstring name(op.name_len + 1); name[0] = '_'; bp.copy(op.name_len, name.data()+1); - t.rmattr(info.pgid.to_coll(), poid, name); + t.rmattr(info.pgid.to_coll(), soid, name); } break; @@ -1182,7 +1183,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req newop.op = CEPH_OSD_OP_WRITE; newop.offset = old_size; newop.length = op.length; - prepare_simple_op(t, reqid, st, poid, old_size, exists, oi, nops, 0, bp, snapc); + prepare_simple_op(t, reqid, st, soid, old_size, exists, oi, nops, 0, bp, snapc); } break; @@ -1236,7 +1237,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req newop.offset = op.truncate_size; dout(10) << " seq " << op.truncate_seq << " > old_seq " << old_seq << ", truncating with " << newop << dendl; - prepare_simple_op(t, reqid, st, poid, old_size, exists, oi, nops, 0, bp, snapc); + prepare_simple_op(t, reqid, st, soid, old_size, exists, oi, nops, 0, bp, snapc); } else { // do smart truncate interval_set<__u64> tm; @@ -1257,7 +1258,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req newop.op = CEPH_OSD_OP_ZERO; newop.offset = p->first; newop.length = p->second; - prepare_simple_op(t, reqid, st, poid, old_size, exists, oi, nops, 0, bp, snapc); + prepare_simple_op(t, reqid, st, soid, old_size, exists, oi, nops, 0, bp, snapc); } oi.truncate_info.clear(); @@ -1278,7 +1279,7 @@ int ReplicatedPG::prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t req } void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, - pobject_t poid, + sobject_t soid, vector& ops, bufferlist& bl, utime_t mtime, bool& exists, __u64& size, object_info_t& oi, eversion_t at_version, SnapContext& snapc, @@ -1295,13 +1296,13 @@ void ReplicatedPG::prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t bufferlist::iterator bp = bl.begin(); for (unsigned i=0; iop->get_oid(); + sobject_t soid(repop->op->get_oid(), CEPH_NOSNAP); ceph_osd_op& first = repop->op->ops[0]; switch (first.op) { +#if 0 case CEPH_OSD_OP_UNBALANCEREADS: dout(-10) << "apply_repop completed unbalance-reads on " << oid << dendl; unbalancing_reads.erase(oid); @@ -1425,12 +1427,13 @@ void ReplicatedPG::apply_repop(RepGather *repop) } */ break; - +#endif + case CEPH_OSD_OP_WRUNLOCK: - dout(-10) << "apply_repop completed wrunlock on " << oid << dendl; - if (waiting_for_wr_unlock.count(oid)) { - osd->take_waiters(waiting_for_wr_unlock[oid]); - waiting_for_wr_unlock.erase(oid); + dout(-10) << "apply_repop completed wrunlock on " << soid << dendl; + if (waiting_for_wr_unlock.count(soid)) { + osd->take_waiters(waiting_for_wr_unlock[soid]); + waiting_for_wr_unlock.erase(soid); } break; } @@ -1513,15 +1516,15 @@ void ReplicatedPG::eval_repop(RepGather *repop) void ReplicatedPG::issue_repop(RepGather *repop, int dest, utime_t now) { - pobject_t poid(info.pgid.pool(), 0, repop->op->get_oid()); + sobject_t soid(repop->op->get_oid(), CEPH_NOSNAP); dout(7) << " issue_repop rep_tid " << repop->rep_tid - << " o " << poid + << " o " << soid << " to osd" << dest << dendl; // forward the write/update/whatever int acks_wanted = CEPH_OSD_FLAG_ACK | CEPH_OSD_FLAG_ONDISK; - MOSDSubOp *wr = new MOSDSubOp(repop->op->get_reqid(), info.pgid, poid, + MOSDSubOp *wr = new MOSDSubOp(repop->op->get_reqid(), info.pgid, soid, repop->op->ops, repop->noop, acks_wanted, osd->osdmap->get_epoch(), repop->rep_tid, repop->at_version); @@ -1662,25 +1665,25 @@ void ReplicatedPG::put_projected_object(ProjectedObjectInfo *pinfo) void ReplicatedPG::op_modify(MOSDOp *op) { int whoami = osd->get_nodeid(); - pobject_t poid(info.pgid.pool(), 0, op->get_oid()); + sobject_t soid(op->get_oid(), CEPH_NOSNAP); // balance-reads set? #if 0 char v; if ((op->get_op() != CEPH_OSD_OP_BALANCEREADS && op->get_op() != CEPH_OSD_OP_UNBALANCEREADS) && - (osd->store->getattr(info.pgid.to_coll(), poid, "balance-reads", &v, 1) >= 0 || - balancing_reads.count(poid.oid))) { + (osd->store->getattr(info.pgid.to_coll(), soid, "balance-reads", &v, 1) >= 0 || + balancing_reads.count(soid.oid))) { - if (!unbalancing_reads.count(poid.oid)) { + if (!unbalancing_reads.count(soid.oid)) { // unbalance - dout(-10) << "preprocess_op unbalancing-reads on " << poid.oid << dendl; - unbalancing_reads.insert(poid.oid); + dout(-10) << "preprocess_op unbalancing-reads on " << soid.oid << dendl; + unbalancing_reads.insert(soid.oid); ceph_object_layout layout; layout.ol_pgid = info.pgid.u.pg64; layout.ol_stripe_unit = 0; MOSDOp *pop = new MOSDOp(0, osd->get_tid(), - poid.oid, + soid.oid, layout, osd->osdmap->get_epoch(), CEPH_OSD_OP_UNBALANCEREADS, 0); @@ -1688,14 +1691,14 @@ void ReplicatedPG::op_modify(MOSDOp *op) } // add to wait queue - dout(-10) << "preprocess_op waiting for unbalance-reads on " << poid.oid << dendl; - waiting_for_unbalanced_reads[poid.oid].push_back(op); + dout(-10) << "preprocess_op waiting for unbalance-reads on " << soid.oid << dendl; + waiting_for_unbalanced_reads[soid.oid].push_back(op); return; } #endif // get existing object info - ProjectedObjectInfo *pinfo = get_projected_object(poid); + ProjectedObjectInfo *pinfo = get_projected_object(soid); // --- locking --- @@ -1739,7 +1742,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) op->set_version(at_version); dout(10) << "op_modify " << opname - << " " << poid.oid + << " " << soid << " ov " << pinfo->oi.version << " av " << at_version << " snapc " << snapc << " snapset " << pinfo->oi.snapset @@ -1749,7 +1752,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) if ((op->get_flags() & CEPH_OSD_FLAG_ORDERSNAP) && snapc.seq < pinfo->oi.snapset.seq) { dout(10) << " ORDERSNAP flag set and snapc seq " << snapc.seq << " < snapset seq " << pinfo->oi.snapset.seq - << " on " << poid << dendl; + << " on " << soid << dendl; osd->reply_op_error(op, -EOLDSNAPC); return; } @@ -1758,10 +1761,10 @@ void ReplicatedPG::op_modify(MOSDOp *op) for (unsigned i=1; istart_recovery_op(this, 1); } } @@ -1790,7 +1793,7 @@ void ReplicatedPG::op_modify(MOSDOp *op) // we are acker. if (!noop) { // log and update later. - prepare_transaction(repop->t, op->get_reqid(), poid, op->ops, op->get_data(), repop->mtime, + prepare_transaction(repop->t, op->get_reqid(), soid, op->ops, op->get_data(), repop->mtime, pinfo->exists, pinfo->size, pinfo->oi, at_version, snapc, trim_to); @@ -1870,7 +1873,7 @@ public: void ReplicatedPG::sub_op_modify(MOSDSubOp *op) { - pobject_t poid = op->poid; + sobject_t soid = op->poid; const char *opname; if (op->noop) @@ -1879,7 +1882,7 @@ void ReplicatedPG::sub_op_modify(MOSDSubOp *op) opname = ceph_osd_op_name(op->ops[0].op); dout(10) << "sub_op_modify " << opname - << " " << poid + << " " << soid << " v " << op->version << dendl; @@ -1893,7 +1896,7 @@ void ReplicatedPG::sub_op_modify(MOSDSubOp *op) osd->take_peer_stat(fromosd, op->peer_stat); // we better not be missing this. - assert(!missing.is_missing(poid.oid)); + assert(!missing.is_missing(soid)); // prepare our transaction ObjectStore::Transaction t; @@ -1996,9 +1999,9 @@ void ReplicatedPG::calc_head_subsets(SnapSet& snapset, pobject_t head, for (int j=snapset.clones.size()-1; j>=0; j--) { pobject_t c = head; - c.oid.snap = snapset.clones[j]; + c.snap = snapset.clones[j]; prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); - if (!missing.is_missing(c.oid)) { + if (!missing.is_missing(c)) { dout(10) << "calc_head_subsets " << head << " has prev " << c << " overlap " << prev << dendl; clone_subsets[c] = prev; @@ -2019,19 +2022,19 @@ void ReplicatedPG::calc_head_subsets(SnapSet& snapset, pobject_t head, << " clone_subsets " << clone_subsets << dendl; } -void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, pobject_t poid, +void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, sobject_t soid, Missing& missing, interval_set<__u64>& data_subset, map >& clone_subsets) { - dout(10) << "calc_clone_subsets " << poid + dout(10) << "calc_clone_subsets " << soid << " clone_overlap " << snapset.clone_overlap << dendl; - __u64 size = snapset.clone_size[poid.oid.snap]; + __u64 size = snapset.clone_size[soid.snap]; unsigned i; for (i=0; i < snapset.clones.size(); i++) - if (snapset.clones[i] == poid.oid.snap) + if (snapset.clones[i] == soid.snap) break; // any overlap with next older clone? @@ -2040,17 +2043,17 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, pobject_t poid, if (size) prev.insert(0, size); for (int j=i-1; j>=0; j--) { - pobject_t c = poid; - c.oid.snap = snapset.clones[j]; + sobject_t c = soid; + c.snap = snapset.clones[j]; prev.intersection_of(snapset.clone_overlap[snapset.clones[j]]); - if (!missing.is_missing(c.oid)) { - dout(10) << "calc_clone_subsets " << poid << " has prev " << c + if (!missing.is_missing(c)) { + dout(10) << "calc_clone_subsets " << soid << " has prev " << c << " overlap " << prev << dendl; clone_subsets[c] = prev; cloning.union_of(prev); break; } - dout(10) << "calc_clone_subsets " << poid << " does not have prev " << c + dout(10) << "calc_clone_subsets " << soid << " does not have prev " << c << " overlap " << prev << dendl; } @@ -2059,17 +2062,17 @@ void ReplicatedPG::calc_clone_subsets(SnapSet& snapset, pobject_t poid, if (size) next.insert(0, size); for (unsigned j=i+1; j::iterator p = missing_loc[poid.oid].begin(); - p != missing_loc[poid.oid].end(); + assert(missing_loc.count(soid)); + for (set::iterator p = missing_loc[soid].begin(); + p != missing_loc[soid].end(); p++) { if (osd->osdmap->is_up(*p)) { fromosd = *p; @@ -2101,9 +2104,9 @@ bool ReplicatedPG::pull(pobject_t poid) } } - dout(7) << "pull " << poid + dout(7) << "pull " << soid << " v " << v - << " on osds " << missing_loc[poid.oid] + << " on osds " << missing_loc[soid] << " from osd" << fromosd << dendl; @@ -2114,13 +2117,13 @@ bool ReplicatedPG::pull(pobject_t poid) interval_set<__u64> data_subset; // is this a snapped object? if so, consult the snapset.. we may not need the entire object! - if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) { - pobject_t head = poid; - head.oid.snap = CEPH_NOSNAP; + if (soid.snap < CEPH_NOSNAP) { + pobject_t head = soid; + head.snap = CEPH_NOSNAP; // do we have the head? - if (missing.is_missing(head.oid)) { - if (pulling.count(head.oid)) { + if (missing.is_missing(head)) { + if (pulling.count(head)) { dout(10) << " missing but already pulling head " << head << dendl; return false; } else { @@ -2135,7 +2138,7 @@ bool ReplicatedPG::pull(pobject_t poid) object_info_t oi(bl); dout(10) << " snapset " << oi.snapset << dendl; - calc_clone_subsets(oi.snapset, poid, missing, + calc_clone_subsets(oi.snapset, soid, missing, data_subset, clone_subsets); // FIXME: this may overestimate if we are pulling multiple clones in parallel... dout(10) << " pulling " << data_subset << ", will clone " << clone_subsets @@ -2150,7 +2153,7 @@ bool ReplicatedPG::pull(pobject_t poid) tid_t tid = osd->get_tid(); vector pull(1); pull[0].op = CEPH_OSD_OP_PULL; - MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, poid, pull, false, CEPH_OSD_FLAG_ACK, + MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, soid, pull, false, CEPH_OSD_FLAG_ACK, osd->osdmap->get_epoch(), tid, v); subop->data_subset.swap(data_subset); // do not include clone_subsets in pull request; we will recalculate this @@ -2159,9 +2162,9 @@ bool ReplicatedPG::pull(pobject_t poid) osd->messenger->send_message(subop, osd->osdmap->get_inst(fromosd)); // take note - assert(pulling.count(poid.oid) == 0); - pulling[poid.oid].first = v; - pulling[poid.oid].second = fromosd; + assert(pulling.count(soid) == 0); + pulling[soid].first = v; + pulling[soid].second = fromosd; return true; } @@ -2170,44 +2173,44 @@ bool ReplicatedPG::pull(pobject_t poid) * intelligently push an object to a replica. make use of existing * clones/heads and dup data ranges where possible. */ -void ReplicatedPG::push_to_replica(pobject_t poid, int peer) +void ReplicatedPG::push_to_replica(sobject_t soid, int peer) { - dout(10) << "push_to_replica " << poid << " osd" << peer << dendl; + dout(10) << "push_to_replica " << soid << " osd" << peer << dendl; // get size struct stat st; - int r = osd->store->stat(info.pgid.to_coll(), poid, &st); + int r = osd->store->stat(info.pgid.to_coll(), soid, &st); assert(r == 0); map > clone_subsets; interval_set<__u64> data_subset; bufferlist bv; - r = osd->store->getattr(info.pgid.to_coll(), poid, OI_ATTR, bv); + r = osd->store->getattr(info.pgid.to_coll(), soid, OI_ATTR, bv); assert(r >= 0); object_info_t oi(bv); // are we doing a clone on the replica? - if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) { - pobject_t head = poid; - head.oid.snap = CEPH_NOSNAP; - if (peer_missing[peer].is_missing(head.oid) && - peer_missing[peer].have_old(head.oid) == oi.prior_version) { + if (soid.snap < CEPH_NOSNAP) { + pobject_t head = soid; + head.snap = CEPH_NOSNAP; + if (peer_missing[peer].is_missing(head) && + peer_missing[peer].have_old(head) == oi.prior_version) { dout(10) << "push_to_replica osd" << peer << " has correct old " << head << " v" << oi.prior_version - << ", pushing " << poid << " attrs as a clone op" << dendl; + << ", pushing " << soid << " attrs as a clone op" << dendl; interval_set<__u64> data_subset; map > clone_subsets; clone_subsets[head].insert(0, st.st_size); - push(poid, peer, data_subset, clone_subsets); + push(soid, peer, data_subset, clone_subsets); return; } // try to base push off of clones that succeed/preceed poid // we need the head (and current SnapSet) to do that. - if (missing.is_missing(head.oid)) { + if (missing.is_missing(head)) { dout(15) << "push_to_replica missing head " << head << ", pushing raw clone" << dendl; - return push(poid, peer); // no head. push manually. + return push(soid, peer); // no head. push manually. } bufferlist bl; @@ -2216,31 +2219,31 @@ void ReplicatedPG::push_to_replica(pobject_t poid, int peer) object_info_t hoi(bl); dout(15) << "push_to_replica head snapset is " << hoi.snapset << dendl; - calc_clone_subsets(hoi.snapset, poid, peer_missing[peer], + calc_clone_subsets(hoi.snapset, soid, peer_missing[peer], data_subset, clone_subsets); } else { // pushing head. // base this on partially on replica's clones? dout(15) << "push_to_replica head snapset is " << oi.snapset << dendl; - calc_head_subsets(oi.snapset, poid, peer_missing[peer], data_subset, clone_subsets); + calc_head_subsets(oi.snapset, soid, peer_missing[peer], data_subset, clone_subsets); } - dout(10) << "push_to_replica " << poid << " pushing " << data_subset + dout(10) << "push_to_replica " << soid << " pushing " << data_subset << " cloning " << clone_subsets << dendl; - push(poid, peer, data_subset, clone_subsets); + push(soid, peer, data_subset, clone_subsets); } /* * push - send object to a peer */ -void ReplicatedPG::push(pobject_t poid, int peer) +void ReplicatedPG::push(sobject_t soid, int peer) { interval_set<__u64> subset; map > clone_subsets; - push(poid, peer, subset, clone_subsets); + push(soid, peer, subset, clone_subsets); } -void ReplicatedPG::push(pobject_t poid, int peer, +void ReplicatedPG::push(sobject_t soid, int peer, interval_set<__u64> &data_subset, map >& clone_subsets) { @@ -2251,7 +2254,7 @@ void ReplicatedPG::push(pobject_t poid, int peer, if (data_subset.size() || clone_subsets.size()) { struct stat st; - int r = osd->store->stat(info.pgid.to_coll(), poid, &st); + int r = osd->store->stat(info.pgid.to_coll(), soid, &st); assert(r == 0); size = st.st_size; @@ -2259,22 +2262,22 @@ void ReplicatedPG::push(pobject_t poid, int peer, p != data_subset.m.end(); p++) { bufferlist bit; - osd->store->read(info.pgid.to_coll(), poid, p->first, p->second, bit); + osd->store->read(info.pgid.to_coll(), soid, p->first, p->second, bit); bl.claim_append(bit); } } else { - osd->store->read(info.pgid.to_coll(), poid, 0, 0, bl); + osd->store->read(info.pgid.to_coll(), soid, 0, 0, bl); size = bl.length(); } - osd->store->getattrs(info.pgid.to_coll(), poid, attrset); + osd->store->getattrs(info.pgid.to_coll(), soid, attrset); bufferlist bv; bv.push_back(attrset[OI_ATTR]); object_info_t oi(bv); // ok - dout(7) << "push " << poid << " v " << oi.version + dout(7) << "push " << soid << " v " << oi.version << " size " << size << " subset " << data_subset << " data " << bl.length() @@ -2290,7 +2293,7 @@ void ReplicatedPG::push(pobject_t poid, int peer, push[0].op = CEPH_OSD_OP_PUSH; push[0].offset = 0; push[0].length = size; - MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, poid, push, false, 0, + MOSDSubOp *subop = new MOSDSubOp(rid, info.pgid, soid, push, false, 0, osd->osdmap->get_epoch(), osd->get_tid(), oi.version); subop->data_subset.swap(data_subset); subop->clone_subsets.swap(clone_subsets); @@ -2299,8 +2302,8 @@ void ReplicatedPG::push(pobject_t poid, int peer, osd->messenger->send_message(subop, osd->osdmap->get_inst(peer)); if (is_primary()) { - peer_missing[peer].got(poid.oid, oi.version); - pushing[poid.oid].insert(peer); + peer_missing[peer].got(soid, oi.version); + pushing[soid].insert(peer); } } @@ -2309,11 +2312,11 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) dout(10) << "sub_op_push_reply from " << reply->get_source() << " " << *reply << dendl; int peer = reply->get_source().num(); - pobject_t poid = reply->get_poid(); + sobject_t soid = reply->get_poid(); - if (pushing.count(poid.oid) && - pushing[poid.oid].count(peer)) { - pushing[poid.oid].erase(peer); + if (pushing.count(soid) && + pushing[soid].count(peer)) { + pushing[soid].erase(peer); if (peer_missing.count(peer) == 0 || peer_missing[peer].num_missing() == 0) @@ -2321,15 +2324,15 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) update_stats(); - if (pushing[poid.oid].empty()) { - dout(10) << "pushed " << poid << " to all replicas" << dendl; + if (pushing[soid].empty()) { + dout(10) << "pushed " << soid << " to all replicas" << dendl; finish_recovery_op(); } else { - dout(10) << "pushed " << poid << ", still waiting for push ack from " - << pushing[poid.oid] << dendl; + dout(10) << "pushed " << soid << ", still waiting for push ack from " + << pushing[soid] << dendl; } } else { - dout(10) << "huh, i wasn't pushing " << poid << dendl; + dout(10) << "huh, i wasn't pushing " << soid << dendl; } delete reply; } @@ -2341,16 +2344,16 @@ void ReplicatedPG::sub_op_push_reply(MOSDSubOpReply *reply) */ void ReplicatedPG::sub_op_pull(MOSDSubOp *op) { - const pobject_t poid = op->poid; + const pobject_t soid = op->poid; const eversion_t v = op->version; - dout(7) << "op_pull " << poid << " v " << op->version + dout(7) << "op_pull " << soid << " v " << op->version << " from " << op->get_source() << dendl; assert(!is_primary()); // we should be a replica or stray. - push(poid, op->get_source().num(), op->data_subset, op->clone_subsets); + push(soid, op->get_source().num(), op->data_subset, op->clone_subsets); delete op; } @@ -2360,12 +2363,12 @@ void ReplicatedPG::sub_op_pull(MOSDSubOp *op) */ void ReplicatedPG::sub_op_push(MOSDSubOp *op) { - pobject_t poid = op->poid; + sobject_t soid = op->poid; eversion_t v = op->version; ceph_osd_op& push = op->ops[0]; dout(7) << "op_push " - << poid + << soid << " v " << v << " len " << push.length << " data_subset " << op->data_subset @@ -2379,8 +2382,8 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) // are we missing (this specific version)? // (if version is wrong, it is either old (we don't want it) or // newer (peering is buggy)) - if (!missing.is_missing(poid.oid, v)) { - dout(7) << "sub_op_push not missing " << poid << " v" << v << dendl; + if (!missing.is_missing(soid, v)) { + dout(7) << "sub_op_push not missing " << soid << " v" << v << dendl; dout(15) << " but i AM missing " << missing.missing << dendl; delete op; return; @@ -2396,11 +2399,11 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) clone_subsets = op->clone_subsets; if (is_primary()) { - if (poid.oid.snap && poid.oid.snap < CEPH_NOSNAP) { + if (soid.snap < CEPH_NOSNAP) { // clone. make sure we have enough data. - pobject_t head = poid; - head.oid.snap = CEPH_NOSNAP; - assert(!missing.is_missing(head.oid)); + pobject_t head = soid; + head.snap = CEPH_NOSNAP; + assert(!missing.is_missing(head)); bufferlist bl; int r = osd->store->getattr(info.pgid.to_coll(), head, OI_ATTR, bl); @@ -2410,11 +2413,11 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) clone_subsets.clear(); // forget what pusher said; recalculate cloning. interval_set<__u64> data_needed; - calc_clone_subsets(hoi.snapset, poid, missing, data_needed, clone_subsets); + calc_clone_subsets(hoi.snapset, soid, missing, data_needed, clone_subsets); dout(10) << "sub_op_push need " << data_needed << ", got " << data_subset << dendl; if (!data_needed.subset_of(data_subset)) { - dout(0) << " we did not get enough of " << poid << " object data" << dendl; + dout(0) << " we did not get enough of " << soid << " object data" << dendl; delete op; return; } @@ -2459,7 +2462,7 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) // write object and add it to the PG ObjectStore::Transaction t; - t.remove(info.pgid.to_coll(), poid); // in case old version exists + t.remove(info.pgid.to_coll(), soid); // in case old version exists __u64 boff = 0; for (map >::iterator p = clone_subsets.begin(); @@ -2469,43 +2472,43 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) q != p->second.m.end(); q++) { dout(15) << " clone_range " << p->first << " " << q->first << "~" << q->second << dendl; - t.clone_range(info.pgid.to_coll(), poid, p->first, q->first, q->second); + t.clone_range(info.pgid.to_coll(), soid, p->first, q->first, q->second); } for (map<__u64,__u64>::iterator p = data_subset.m.begin(); p != data_subset.m.end(); p++) { bufferlist bit; bit.substr_of(data, boff, p->second); - t.write(info.pgid.to_coll(), poid, p->first, p->second, bit); + t.write(info.pgid.to_coll(), soid, p->first, p->second, bit); dout(15) << " write " << p->first << "~" << p->second << dendl; boff += p->second; } if (data_subset.empty()) - t.touch(info.pgid.to_coll(), poid); + t.touch(info.pgid.to_coll(), soid); - t.setattrs(info.pgid.to_coll(), poid, op->attrset); - if (poid.oid.snap && poid.oid.snap != CEPH_NOSNAP && + t.setattrs(info.pgid.to_coll(), soid, op->attrset); + if (soid.snap != CEPH_NOSNAP && op->attrset.count(OI_ATTR)) { bufferlist bl; bl.push_back(op->attrset[OI_ATTR]); object_info_t oi(bl); if (oi.snaps.size()) { coll_t lc = make_snap_collection(t, oi.snaps[0]); - t.collection_add(lc, info.pgid.to_coll(), poid); + t.collection_add(lc, info.pgid.to_coll(), soid); if (oi.snaps.size() > 1) { coll_t hc = make_snap_collection(t, oi.snaps[oi.snaps.size()-1]); - t.collection_add(hc, info.pgid.to_coll(), poid); + t.collection_add(hc, info.pgid.to_coll(), soid); } } } - missing.got(poid.oid, v); + missing.got(soid, v); // raise last_complete? while (log.complete_to != log.log.end()) { - if (missing.missing.count(log.complete_to->oid)) + if (missing.missing.count(log.complete_to->soid)) break; if (info.last_complete < log.complete_to->version) info.last_complete = log.complete_to->version; @@ -2523,11 +2526,11 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) if (is_primary()) { - missing_loc.erase(poid.oid); + missing_loc.erase(soid); // close out pull op? - if (pulling.count(poid.oid)) { - pulling.erase(poid.oid); + if (pulling.count(soid)) { + pulling.erase(soid); finish_recovery_op(); } @@ -2542,8 +2545,8 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) for (unsigned i=1; istart_recovery_op(this, 1); } } @@ -2558,9 +2561,9 @@ void ReplicatedPG::sub_op_push(MOSDSubOp *op) delete op; // kick waiters - if (waiting_for_missing_object.count(poid.oid)) { - osd->take_waiters(waiting_for_missing_object[poid.oid]); - waiting_for_missing_object.erase(poid.oid); + if (waiting_for_missing_object.count(soid)) { + osd->take_waiters(waiting_for_missing_object[soid]); + waiting_for_missing_object.erase(soid); } } @@ -2623,7 +2626,7 @@ void ReplicatedPG::on_role_change() dout(10) << "on_role_change" << dendl; // take object waiters - for (hash_map >::iterator it = waiting_for_missing_object.begin(); + for (hash_map >::iterator it = waiting_for_missing_object.begin(); it != waiting_for_missing_object.end(); it++) osd->take_waiters(it->second); @@ -2702,50 +2705,50 @@ int ReplicatedPG::recover_primary(int max) int started = 0; int skipped = 0; - map::iterator p = missing.missing.lower_bound(log.last_requested); + map::iterator p = missing.missing.lower_bound(log.last_requested); while (p != missing.missing.end()) { assert(log.objects.count(p->first)); latest = log.objects[p->first]; assert(latest); - pobject_t poid(info.pgid.pool(), 0, latest->oid); - pobject_t head = poid; - head.oid.snap = CEPH_NOSNAP; + sobject_t soid(latest->soid); + sobject_t head = soid; + head.snap = CEPH_NOSNAP; dout(10) << "recover_primary " << *latest << (latest->is_update() ? " (update)":"") - << (missing.is_missing(latest->oid) ? " (missing)":"") - << (missing.is_missing(head.oid) ? " (missing head)":"") - << (pulling.count(latest->oid) ? " (pulling)":"") - << (pulling.count(head.oid) ? " (pulling head)":"") + << (missing.is_missing(latest->soid) ? " (missing)":"") + << (missing.is_missing(head) ? " (missing head)":"") + << (pulling.count(latest->soid) ? " (pulling)":"") + << (pulling.count(head) ? " (pulling head)":"") << dendl; assert(latest->is_update()); - if (!pulling.count(latest->oid)) { - if (pulling.count(head.oid)) { + if (!pulling.count(latest->soid)) { + if (pulling.count(head)) { ++skipped; } else { // is this a clone operation that we can do locally? if (latest->op == Log::Entry::CLONE) { - if (missing.is_missing(head.oid) && - missing.have_old(head.oid) == latest->prior_version) { + if (missing.is_missing(head) && + missing.have_old(head) == latest->prior_version) { dout(10) << "recover_primary cloning " << head << " v" << latest->prior_version - << " to " << poid << " v" << latest->version + << " to " << soid << " v" << latest->version << " snaps " << latest->snaps << dendl; vector snaps; ::decode(snaps, latest->snaps); ObjectStore::Transaction t; - _make_clone(t, head, poid, latest->prior_version, latest->version, latest->reqid, latest->mtime, snaps); + _make_clone(t, head, soid, latest->prior_version, latest->version, latest->reqid, latest->mtime, snaps); osd->store->apply_transaction(t); - missing.got(latest->oid, latest->version); - missing_loc.erase(latest->oid); + missing.got(latest->soid, latest->version); + missing_loc.erase(latest->soid); continue; } } - if (pull(poid)) + if (pull(soid)) ++started; else ++skipped; @@ -2758,7 +2761,7 @@ int ReplicatedPG::recover_primary(int max) // only advance last_requested if we haven't skipped anything if (!skipped) - log.last_requested = latest->oid; + log.last_requested = latest->soid; } // done? @@ -2807,18 +2810,17 @@ int ReplicatedPG::recover_replicas(int max) continue; // oldest first! - object_t oid = peer_missing[peer].rmissing.begin()->second; - pobject_t poid(info.pgid.pool(), 0, oid); + sobject_t soid = peer_missing[peer].rmissing.begin()->second; eversion_t v = peer_missing[peer].rmissing.begin()->first; - push_to_replica(poid, peer); + push_to_replica(soid, peer); // do other peers need it too? for (i++; i= max) @@ -2859,56 +2861,53 @@ void ReplicatedPG::clean_up_local(ObjectStore::Transaction& t) if (ls.size() != info.stats.num_objects) dout(10) << " WARNING: " << ls.size() << " != num_objects " << info.stats.num_objects << dendl; - set s; + set s; for (vector::iterator i = ls.begin(); i != ls.end(); i++) - s.insert(i->oid); + s.insert(*i); - set did; + set did; for (list::reverse_iterator p = log.log.rbegin(); p != log.log.rend(); p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); + if (did.count(p->soid)) continue; + did.insert(p->soid); if (p->is_delete()) { - if (s.count(p->oid)) { - pobject_t poid(info.pgid.pool(), 0, p->oid); - dout(10) << " deleting " << poid + if (s.count(p->soid)) { + dout(10) << " deleting " << p->soid << " when " << p->version << dendl; - t.remove(info.pgid.to_coll(), poid); + t.remove(info.pgid.to_coll(), p->soid); } - s.erase(p->oid); + s.erase(p->soid); } else { // just leave old objects.. they're missing or whatever - s.erase(p->oid); + s.erase(p->soid); } } - for (set::iterator i = s.begin(); + for (set::iterator i = s.begin(); i != s.end(); i++) { - pobject_t poid(info.pgid.pool(), 0, *i); - dout(10) << " deleting stray " << poid << dendl; - t.remove(info.pgid.to_coll(), poid); + dout(10) << " deleting stray " << *i << dendl; + t.remove(info.pgid.to_coll(), *i); } } else { // just scan the log. - set did; + set did; for (list::reverse_iterator p = log.log.rbegin(); p != log.log.rend(); p++) { - if (did.count(p->oid)) continue; - did.insert(p->oid); + if (did.count(p->soid)) continue; + did.insert(p->soid); if (p->is_delete()) { - pobject_t poid(info.pgid.pool(), 0, p->oid); - dout(10) << " deleting " << poid + dout(10) << " deleting " << p->soid << " when " << p->version << dendl; - t.remove(info.pgid.to_coll(), poid); + t.remove(info.pgid.to_coll(), p->soid); } else { // keep old(+missing) objects, just for kicks. } @@ -2931,7 +2930,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) coll_t c = info.pgid.to_coll(); // traverse in reverse order. - pobject_t head; + sobject_t head; SnapSet snapset; unsigned curclone = 0; @@ -2942,12 +2941,12 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) for (vector::reverse_iterator p = scrubmap.objects.rbegin(); p != scrubmap.objects.rend(); p++) { - pobject_t poid = p->poid; + sobject_t soid = p->poid; stat.num_objects++; // basic checks. if (p->attrs.count(OI_ATTR) == 0) { - dout(0) << "scrub no '" << OI_ATTR << "' attr on " << poid << dendl; + dout(0) << "scrub no '" << OI_ATTR << "' attr on " << soid << dendl; errors++; continue; } @@ -2955,7 +2954,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) bv.push_back(p->attrs[OI_ATTR]); object_info_t oi(bv); - dout(20) << "scrub " << poid << " " << oi << dendl; + dout(20) << "scrub " << soid << " " << oi << dendl; stat.num_bytes += p->size; stat.num_kb += SHIFT_ROUND_UP(p->size, 10); @@ -2965,11 +2964,11 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) //assert(data.length() == p->size); // new head? - if (poid.oid.snap == CEPH_NOSNAP) { + if (soid.snap == CEPH_NOSNAP) { // it's a head. - if (head != pobject_t()) { + if (head != sobject_t()) { derr(0) << " missing clone(s) for " << head << dendl; - assert(head == pobject_t()); // we had better be done + assert(head == sobject_t()); // we had better be done errors++; } @@ -2979,7 +2978,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) // what will be next? if (snapset.clones.empty()) - head = pobject_t(); // no clones. + head = sobject_t(); // no clones. else curclone = snapset.clones.size()-1; @@ -2995,13 +2994,13 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) } } - } else if (poid.oid.snap) { + } else if (soid.snap) { // it's a clone - assert(head != pobject_t()); + assert(head != sobject_t()); stat.num_object_clones++; - assert(poid.oid.snap == snapset.clones[curclone]); + assert(soid.snap == snapset.clones[curclone]); assert(p->size == snapset.clone_size[curclone]); @@ -3011,7 +3010,7 @@ int ReplicatedPG::_scrub(ScrubMap& scrubmap) // what's next? curclone++; if (curclone == snapset.clones.size()) - head = pobject_t(); + head = sobject_t(); } else { // it's unversioned. diff --git a/src/osd/ReplicatedPG.h b/src/osd/ReplicatedPG.h index e9a5a668d68bf..cbde3e4929d6b 100644 --- a/src/osd/ReplicatedPG.h +++ b/src/osd/ReplicatedPG.h @@ -32,7 +32,7 @@ public: */ struct ProjectedObjectInfo { int ref; - pobject_t poid; + sobject_t poid; bool exists; __u64 size; @@ -142,9 +142,9 @@ protected: // projected object info - map projected_objects; + map projected_objects; - ProjectedObjectInfo *get_projected_object(pobject_t poid); + ProjectedObjectInfo *get_projected_object(sobject_t poid); void put_projected_object(ProjectedObjectInfo *pinfo); bool is_write_in_progress() { @@ -152,27 +152,27 @@ protected: } // load balancing - set balancing_reads; - set unbalancing_reads; - hash_map > waiting_for_unbalanced_reads; // i.e. primary-lock + set balancing_reads; + set unbalancing_reads; + hash_map > waiting_for_unbalanced_reads; // i.e. primary-lock // push/pull - map > pulling; // which objects are currently being pulled, and from where - map > pushing; + map > pulling; // which objects are currently being pulled, and from where + map > pushing; - void calc_head_subsets(SnapSet& snapset, pobject_t head, + void calc_head_subsets(SnapSet& snapset, sobject_t head, Missing& missing, interval_set<__u64>& data_subset, - map >& clone_subsets); - void calc_clone_subsets(SnapSet& snapset, pobject_t poid, Missing& missing, + map >& clone_subsets); + void calc_clone_subsets(SnapSet& snapset, sobject_t poid, Missing& missing, interval_set<__u64>& data_subset, - map >& clone_subsets); - void push_to_replica(pobject_t oid, int dest); - void push(pobject_t oid, int dest); - void push(pobject_t oid, int dest, interval_set<__u64>& data_subset, - map >& clone_subsets); - bool pull(pobject_t oid); + map >& clone_subsets); + void push_to_replica(sobject_t oid, int dest); + void push(sobject_t oid, int dest); + void push(sobject_t oid, int dest, interval_set<__u64>& data_subset, + map >& clone_subsets); + bool pull(sobject_t oid); // modify @@ -180,17 +180,17 @@ protected: void sub_op_modify_ondisk(MOSDSubOp *op, int ackerosd, eversion_t last_complete); void _make_clone(ObjectStore::Transaction& t, - pobject_t head, pobject_t coid, + sobject_t head, sobject_t coid, eversion_t ov, eversion_t v, osd_reqid_t& reqid, utime_t mtime, vector& snaps); void prepare_clone(ObjectStore::Transaction& t, bufferlist& logbl, osd_reqid_t reqid, pg_stat_t& st, - pobject_t poid, loff_t old_size, object_info_t& oi, + sobject_t poid, loff_t old_size, object_info_t& oi, eversion_t& at_version, SnapContext& snapc); void add_interval_usage(interval_set<__u64>& s, pg_stat_t& st); int prepare_simple_op(ObjectStore::Transaction& t, osd_reqid_t reqid, pg_stat_t& st, - pobject_t poid, __u64& old_size, bool& exists, object_info_t& oi, + sobject_t poid, __u64& old_size, bool& exists, object_info_t& oi, vector& ops, int opn, bufferlist::iterator& bp, SnapContext& snapc); void prepare_transaction(ObjectStore::Transaction& t, osd_reqid_t reqid, - pobject_t poid, + sobject_t poid, vector& ops, bufferlist& bl, utime_t mtime, bool& exists, __u64& size, object_info_t& oi, eversion_t at_version, SnapContext& snapc, @@ -210,7 +210,7 @@ protected: int recover_primary(int max); int recover_replicas(int max); - int pick_read_snap(pobject_t& poid, object_info_t& coi); + int pick_read_snap(sobject_t& poid, object_info_t& coi); void op_read(MOSDOp *op); void op_modify(MOSDOp *op); @@ -243,8 +243,8 @@ public: bool same_for_modify_since(epoch_t e); bool same_for_rep_modify_since(epoch_t e); - bool is_missing_object(object_t oid); - void wait_for_missing_object(object_t oid, Message *op); + bool is_missing_object(sobject_t oid); + void wait_for_missing_object(sobject_t oid, Message *op); void on_osd_failure(int o); void on_acker_change(); diff --git a/src/osd/osd_types.h b/src/osd/osd_types.h index 190ba850f1219..a5135228c7759 100644 --- a/src/osd/osd_types.h +++ b/src/osd/osd_types.h @@ -19,7 +19,6 @@ #include "msg/msg_types.h" #include "include/types.h" -#include "include/pobject.h" #include "include/interval_set.h" #include "include/nstring.h" @@ -105,7 +104,7 @@ enum { //#define CEPH_POOL(poolset, size) (((poolset) << 8) + (size)) -#define OSD_SUPERBLOCK_POBJECT pobject_t(CEPH_OSDMETADATA_NS, 0, object_t(0,0)) +#define OSD_SUPERBLOCK_POBJECT sobject_t(object_t(0,0), 0) // placement group id struct pg_t { @@ -132,9 +131,7 @@ struct pg_t { operator uint64_t() const { return u.pg64; } pobject_t to_log_pobject() const { - return pobject_t(CEPH_OSDMETADATA_NS, - 0, - object_t(u.pg64, 0)); + return pobject_t(object_t(u.pg64, 0), CEPH_NOSNAP); } coll_t to_coll() const { @@ -746,7 +743,7 @@ struct object_info_t { ::encode(prior_version, bl); ::encode(last_reqid, bl); ::encode(mtime, bl); - if (poid.oid.snap == CEPH_NOSNAP) { + if (poid.snap == CEPH_NOSNAP) { ::encode(snapset, bl); ::encode(wrlock_by, bl); } else @@ -759,7 +756,7 @@ struct object_info_t { ::decode(prior_version, bl); ::decode(last_reqid, bl); ::decode(mtime, bl); - if (poid.oid.snap == CEPH_NOSNAP) { + if (poid.snap == CEPH_NOSNAP) { ::decode(snapset, bl); ::decode(wrlock_by, bl); } else @@ -784,7 +781,7 @@ inline ostream& operator<<(ostream& out, const object_info_t& oi) { << " " << oi.last_reqid; if (oi.wrlock_by.tid) out << " wrlock_by=" << oi.wrlock_by; - if (oi.poid.oid.snap == CEPH_NOSNAP) + if (oi.poid.snap == CEPH_NOSNAP) out << " " << oi.snapset << ")"; else out << " " << oi.snaps << ")"; diff --git a/src/osdc/Filer.cc b/src/osdc/Filer.cc index 309bc97db66d5..9389be308f8f3 100644 --- a/src/osdc/Filer.cc +++ b/src/osdc/Filer.cc @@ -88,14 +88,14 @@ void Filer::_probe(Probe *probe) << dendl; // map range onto objects - file_to_extents(probe->ino, &probe->layout, probe->snapid, probe->from, probe->probing_len, probe->probing); + file_to_extents(probe->ino, &probe->layout, probe->from, probe->probing_len, probe->probing); for (vector::iterator p = probe->probing.begin(); p != probe->probing.end(); p++) { dout(10) << "_probe probing " << p->oid << dendl; C_Probe *c = new C_Probe(this, probe, p->oid); - probe->ops[p->oid] = objecter->stat(p->oid, p->layout, &c->size, probe->flags, c); + probe->ops[p->oid] = objecter->stat(p->oid, p->layout, probe->snapid, &c->size, probe->flags, c); } } @@ -192,7 +192,7 @@ void Filer::_probed(Probe *probe, object_t oid, __u64 size) } -void Filer::file_to_extents(inodeno_t ino, ceph_file_layout *layout, snapid_t snap, +void Filer::file_to_extents(inodeno_t ino, ceph_file_layout *layout, __u64 offset, size_t len, vector& extents) { @@ -226,7 +226,7 @@ void Filer::file_to_extents(inodeno_t ino, ceph_file_layout *layout, snapid_t sn // find oid, extent ObjectExtent *ex = 0; - object_t oid( ino, objectno, snap ); + object_t oid( ino, objectno ); if (object_extents.count(oid)) ex = &object_extents[oid]; else { diff --git a/src/osdc/Filer.h b/src/osdc/Filer.h index 6fd1ae5dcece7..e2971887da13d 100644 --- a/src/osdc/Filer.h +++ b/src/osdc/Filer.h @@ -86,9 +86,8 @@ class Filer { * map (ino, layout, offset, len) to a (list of) OSDExtents (byte * ranges in objects on (primary) osds) */ - void file_to_extents(inodeno_t ino, ceph_file_layout *layout, snapid_t snap, - __u64 offset, - size_t len, + void file_to_extents(inodeno_t ino, ceph_file_layout *layout, + __u64 offset, size_t len, vector& extents); @@ -98,16 +97,16 @@ class Filer { int read(inodeno_t ino, ceph_file_layout *layout, - snapid_t snapid, + snapid_t snap, __u64 offset, size_t len, bufferlist *bl, // ptr to data int flags, Context *onfinish) { - assert(snapid); // (until there is a non-NOSNAP write) + assert(snap); // (until there is a non-NOSNAP write) vector extents; - file_to_extents(ino, layout, snapid, offset, len, extents); - objecter->sg_read(extents, bl, flags, onfinish); + file_to_extents(ino, layout, offset, len, extents); + objecter->sg_read(extents, snap, bl, flags, onfinish); return 0; } @@ -123,7 +122,7 @@ class Filer { Context *onack, Context *oncommit) { vector extents; - file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, extents); + file_to_extents(ino, layout, offset, len, extents); objecter->sg_write(extents, snapc, bl, mtime, flags, onack, oncommit); return 0; } @@ -140,7 +139,7 @@ class Filer { Context *oncommit) { bufferlist bl; vector extents; - file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, extents); + file_to_extents(ino, layout, offset, len, extents); if (extents.size() == 1) { vector ops(1); memset(&ops[0], 0, sizeof(ops[0])); @@ -178,7 +177,7 @@ class Filer { Context *onack, Context *oncommit) { vector extents; - file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, extents); + file_to_extents(ino, layout, offset, len, extents); if (extents.size() == 1) { objecter->zero(extents[0].oid, extents[0].layout, extents[0].offset, extents[0].length, snapc, mtime, flags, onack, oncommit); @@ -208,7 +207,7 @@ class Filer { Context *onack, Context *oncommit) { vector extents; - file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, extents); + file_to_extents(ino, layout, offset, len, extents); if (extents.size() == 1) { objecter->remove(extents[0].oid, extents[0].layout, snapc, mtime, flags, onack, oncommit); diff --git a/src/osdc/Journaler.cc b/src/osdc/Journaler.cc index d1877ea8c8c92..ed2d546fbd198 100644 --- a/src/osdc/Journaler.cc +++ b/src/osdc/Journaler.cc @@ -92,7 +92,7 @@ void Journaler::recover(Context *onread) object_t oid(ino, 0); ceph_object_layout ol = objecter->osdmap->make_object_layout(oid, pg_pool); - objecter->read_full(oid, ol, &fin->bl, 0, fin); + objecter->read_full(oid, ol, CEPH_NOSNAP, &fin->bl, 0, fin); } void Journaler::_finish_read_head(int r, bufferlist& bl) diff --git a/src/osdc/ObjectCacher.cc b/src/osdc/ObjectCacher.cc index bfbf700f91932..96ecf002ed8e9 100644 --- a/src/osdc/ObjectCacher.cc +++ b/src/osdc/ObjectCacher.cc @@ -140,7 +140,7 @@ int ObjectCacher::Object::map_read(OSDRead *rd, ex_it != rd->extents.end(); ex_it++) { - if (ex_it->oid != oid) continue; + if (ex_it->oid != oid.oid) continue; dout(10) << "map_read " << ex_it->oid << " " << ex_it->offset << "~" << ex_it->length << dendl; @@ -231,7 +231,7 @@ ObjectCacher::BufferHead *ObjectCacher::Object::map_write(OSDWrite *wr) ex_it != wr->extents.end(); ex_it++) { - if (ex_it->oid != oid) continue; + if (ex_it->oid != oid.oid) continue; dout(10) << "map_write oex " << ex_it->oid << " " << ex_it->offset << "~" << ex_it->length << dendl; @@ -384,7 +384,7 @@ void ObjectCacher::close_object(Object *ob) assert(ob->can_close()); // ok! - objects.erase(ob->get_oid()); + objects.erase(ob->get_soid()); objects_by_ino[ob->get_ino()].erase(ob); if (objects_by_ino[ob->get_ino()].empty()) objects_by_ino.erase(ob->get_ino()); @@ -401,20 +401,20 @@ void ObjectCacher::bh_read(BufferHead *bh) mark_rx(bh); // finisher - C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_oid(), bh->start(), bh->length()); + C_ReadFinish *onfinish = new C_ReadFinish(this, bh->ob->get_soid(), bh->start(), bh->length()); // go objecter->read(bh->ob->get_oid(), bh->ob->get_layout(), - bh->start(), bh->length(), + bh->start(), bh->length(), bh->ob->get_snap(), &onfinish->bl, 0, onfinish); } -void ObjectCacher::bh_read_finish(object_t oid, loff_t start, size_t length, bufferlist &bl) +void ObjectCacher::bh_read_finish(sobject_t oid, loff_t start, size_t length, bufferlist &bl) { //lock.Lock(); dout(7) << "bh_read_finish " - << oid + << oid << " " << start << "~" << length << " (bl is " << bl.length() << ")" << dendl; @@ -491,12 +491,12 @@ void ObjectCacher::bh_write(BufferHead *bh) dout(7) << "bh_write " << *bh << dendl; // finishers - C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_oid(), bh->start(), bh->length()); - C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_oid(), bh->start(), bh->length()); + C_WriteAck *onack = new C_WriteAck(this, bh->ob->get_soid(), bh->start(), bh->length()); + C_WriteCommit *oncommit = new C_WriteCommit(this, bh->ob->get_soid(), bh->start(), bh->length()); // go tid_t tid = objecter->write(bh->ob->get_oid(), bh->ob->get_layout(), - bh->start(), bh->length(), + bh->start(), bh->length(), bh->snapc, bh->bl, bh->last_write, 0, onack, oncommit); @@ -512,12 +512,12 @@ void ObjectCacher::bh_write(BufferHead *bh) mark_tx(bh); } -void ObjectCacher::lock_ack(list& oids, tid_t tid) +void ObjectCacher::lock_ack(list& oids, tid_t tid) { - for (list::iterator i = oids.begin(); + for (list::iterator i = oids.begin(); i != oids.end(); i++) { - object_t oid = *i; + sobject_t oid = *i; if (objects.count(oid) == 0) { dout(7) << "lock_ack no object cache" << dendl; @@ -574,7 +574,7 @@ void ObjectCacher::lock_ack(list& oids, tid_t tid) } } -void ObjectCacher::bh_write_ack(object_t oid, loff_t start, size_t length, tid_t tid) +void ObjectCacher::bh_write_ack(sobject_t oid, loff_t start, size_t length, tid_t tid) { //lock.Lock(); @@ -643,7 +643,7 @@ void ObjectCacher::bh_write_ack(object_t oid, loff_t start, size_t length, tid_t //lock.Unlock(); } -void ObjectCacher::bh_write_commit(object_t oid, loff_t start, size_t length, tid_t tid) +void ObjectCacher::bh_write_commit(sobject_t oid, loff_t start, size_t length, tid_t tid) { //lock.Lock(); @@ -764,7 +764,8 @@ int ObjectCacher::readx(OSDRead *rd, inodeno_t ino, Context *onfinish) dout(10) << "readx " << *ex_it << dendl; // get Object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); + sobject_t soid(ex_it->oid, rd->snap); + Object *o = get_object(soid, ino, ex_it->layout); // map extent into bufferheads map hits, missing, rx; @@ -901,7 +902,8 @@ int ObjectCacher::writex(OSDWrite *wr, inodeno_t ino) ex_it != wr->extents.end(); ex_it++) { // get object cache - Object *o = get_object(ex_it->oid, ino, ex_it->layout); + sobject_t soid(ex_it->oid, CEPH_NOSNAP); + Object *o = get_object(soid, ino, ex_it->layout); // map it all into a single bufferhead. BufferHead *bh = o->map_write(wr); @@ -1039,7 +1041,7 @@ int ObjectCacher::atomic_sync_readx(OSDRead *rd, inodeno_t ino, Mutex& lock) bool done = false; objecter->read(rd->extents[0].oid, rd->extents[0].layout, rd->extents[0].offset, rd->extents[0].length, - rd->bl, 0, + rd->snap, rd->bl, 0, new C_SafeCond(&flock, &cond, &done)); // block @@ -1058,7 +1060,8 @@ int ObjectCacher::atomic_sync_readx(OSDRead *rd, inodeno_t ino, Mutex& lock) for (map::iterator i = by_oid.begin(); i != by_oid.end(); i++) { - Object *o = get_object(i->first, ino, i->second.layout); + sobject_t soid(i->first, rd->snap); + Object *o = get_object(soid, ino, i->second.layout); rdlock(o); } @@ -1078,8 +1081,9 @@ int ObjectCacher::atomic_sync_readx(OSDRead *rd, inodeno_t ino, Mutex& lock) for (vector::iterator ex_it = extents.begin(); ex_it != extents.end(); ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; + sobject_t soid(ex_it->oid, rd->snap); + assert(objects.count(soid)); + Object *o = objects[soid]; rdunlock(o); } } @@ -1098,7 +1102,7 @@ int ObjectCacher::atomic_sync_writex(OSDWrite *wr, inodeno_t ino, Mutex& lock) // single object. // make sure we aren't already locking/locked... - object_t oid = wr->extents.front().oid; + sobject_t oid(wr->extents.front().oid, CEPH_NOSNAP); Object *o = 0; if (objects.count(oid)) o = get_object(oid, ino, wr->extents.front().layout); if (!o || @@ -1135,7 +1139,8 @@ int ObjectCacher::atomic_sync_writex(OSDWrite *wr, inodeno_t ino, Mutex& lock) for (map::iterator i = by_oid.begin(); i != by_oid.end(); i++) { - Object *o = get_object(i->first, ino, i->second.layout); + sobject_t soid(i->first, CEPH_NOSNAP); + Object *o = get_object(soid, ino, i->second.layout); wrlock(o); } @@ -1150,8 +1155,9 @@ int ObjectCacher::atomic_sync_writex(OSDWrite *wr, inodeno_t ino, Mutex& lock) for (vector::iterator ex_it = extents.begin(); ex_it != extents.end(); ex_it++) { - assert(objects.count(ex_it->oid)); - Object *o = objects[ex_it->oid]; + sobject_t soid(ex_it->oid, CEPH_NOSNAP); + assert(objects.count(soid)); + Object *o = objects[soid]; wrunlock(o); } @@ -1173,8 +1179,8 @@ void ObjectCacher::rdlock(Object *o) o->lock_state = Object::LOCK_RDLOCKING; - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + C_LockAck *ack = new C_LockAck(this, o->get_soid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_soid(), 0, 0); commit->tid = ack->tid = @@ -1217,8 +1223,8 @@ void ObjectCacher::wrlock(Object *o) op = CEPH_OSD_OP_WRLOCK; } - C_LockAck *ack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + C_LockAck *ack = new C_LockAck(this, o->get_soid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_soid(), 0, 0); commit->tid = ack->tid = @@ -1262,8 +1268,8 @@ void ObjectCacher::rdunlock(Object *o) o->lock_state = Object::LOCK_RDUNLOCKING; - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + C_LockAck *lockack = new C_LockAck(this, o->get_soid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_soid(), 0, 0); commit->tid = lockack->tid = o->last_write_tid = objecter->lock(o->get_oid(), o->get_layout(), CEPH_OSD_OP_RDUNLOCK, 0, lockack, commit); @@ -1294,8 +1300,8 @@ void ObjectCacher::wrunlock(Object *o) o->lock_state = Object::LOCK_WRUNLOCKING; } - C_LockAck *lockack = new C_LockAck(this, o->get_oid()); - C_WriteCommit *commit = new C_WriteCommit(this, o->get_oid(), 0, 0); + C_LockAck *lockack = new C_LockAck(this, o->get_soid()); + C_WriteCommit *commit = new C_WriteCommit(this, o->get_soid(), 0, 0); commit->tid = lockack->tid = o->last_write_tid = objecter->lock(o->get_oid(), o->get_layout(), op, 0, lockack, commit); @@ -1576,8 +1582,10 @@ void ObjectCacher::truncate_set(inodeno_t ino, vector& exls) p != exls.end(); ++p) { ObjectExtent &ex = *p; - if (objects.count(ex.oid) == 0) continue; - Object *ob = objects[ex.oid]; + sobject_t soid(ex.oid, CEPH_NOSNAP); + if (objects.count(soid) == 0) + continue; + Object *ob = objects[soid]; // purge or truncate? if (ex.offset == 0) { diff --git a/src/osdc/ObjectCacher.h b/src/osdc/ObjectCacher.h index f16b27e8949e7..1b6ac910bec74 100644 --- a/src/osdc/ObjectCacher.h +++ b/src/osdc/ObjectCacher.h @@ -26,14 +26,15 @@ class ObjectCacher { // read scatter/gather struct OSDRead { vector extents; + snapid_t snap; map read_data; // bits of data as they come back bufferlist *bl; int flags; - OSDRead(bufferlist *b, int f) : bl(b), flags(f) {} + OSDRead(snapid_t s, bufferlist *b, int f) : snap(s), bl(b), flags(f) {} }; - OSDRead *prepare_read(bufferlist *b, int f) { - return new OSDRead(b, f); + OSDRead *prepare_read(snapid_t snap, bufferlist *b, int f) { + return new OSDRead(snap, b, f); } // write scatter/gather @@ -129,7 +130,7 @@ class ObjectCacher { private: // ObjectCacher::Object fields ObjectCacher *oc; - object_t oid; // this _always_ is oid.rev=0 + sobject_t oid; inodeno_t ino; ceph_object_layout layout; @@ -162,7 +163,7 @@ class ObjectCacher { int rdlock_ref; // how many ppl want or are using a READ lock public: - Object(ObjectCacher *_oc, object_t o, inodeno_t i, ceph_object_layout& l) : + Object(ObjectCacher *_oc, sobject_t o, inodeno_t i, ceph_object_layout& l) : oc(_oc), oid(o), ino(i), layout(l), last_write_tid(0), last_ack_tid(0), last_commit_tid(0), @@ -173,7 +174,9 @@ class ObjectCacher { assert(data.empty()); } - object_t get_oid() { return oid; } + sobject_t get_soid() { return oid; } + object_t get_oid() { return oid.oid; } + snapid_t get_snap() { return oid.snap; } inodeno_t get_ino() { return ino; } ceph_object_layout& get_layout() { return layout; } @@ -243,7 +246,7 @@ class ObjectCacher { flush_set_callback_t flush_set_callback, commit_set_callback; void *flush_set_callback_arg; - hash_map objects; + hash_map objects; hash_map > objects_by_ino; hash_map dirty_tx_by_ino; hash_map > uncommitted_by_ino; @@ -266,7 +269,7 @@ class ObjectCacher { // objects - Object *get_object(object_t oid, inodeno_t ino, ceph_object_layout &l) { + Object *get_object(sobject_t oid, inodeno_t ino, ceph_object_layout &l) { // have it? if (objects.count(oid)) return objects[oid]; @@ -404,19 +407,19 @@ class ObjectCacher { void wrunlock(Object *o); public: - void bh_read_finish(object_t oid, loff_t offset, size_t length, bufferlist &bl); - void bh_write_ack(object_t oid, loff_t offset, size_t length, tid_t t); - void bh_write_commit(object_t oid, loff_t offset, size_t length, tid_t t); - void lock_ack(list& oids, tid_t tid); + void bh_read_finish(sobject_t oid, loff_t offset, size_t length, bufferlist &bl); + void bh_write_ack(sobject_t oid, loff_t offset, size_t length, tid_t t); + void bh_write_commit(sobject_t oid, loff_t offset, size_t length, tid_t t); + void lock_ack(list& oids, tid_t tid); class C_ReadFinish : public Context { ObjectCacher *oc; - object_t oid; + sobject_t oid; loff_t start; size_t length; public: bufferlist bl; - C_ReadFinish(ObjectCacher *c, object_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + C_ReadFinish(ObjectCacher *c, sobject_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} void finish(int r) { oc->bh_read_finish(oid, start, length, bl); } @@ -424,24 +427,24 @@ class ObjectCacher { class C_WriteAck : public Context { ObjectCacher *oc; - object_t oid; + sobject_t oid; loff_t start; size_t length; public: tid_t tid; - C_WriteAck(ObjectCacher *c, object_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + C_WriteAck(ObjectCacher *c, sobject_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} void finish(int r) { oc->bh_write_ack(oid, start, length, tid); } }; class C_WriteCommit : public Context { ObjectCacher *oc; - object_t oid; + sobject_t oid; loff_t start; size_t length; public: tid_t tid; - C_WriteCommit(ObjectCacher *c, object_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} + C_WriteCommit(ObjectCacher *c, sobject_t o, loff_t s, size_t l) : oc(c), oid(o), start(s), length(l) {} void finish(int r) { oc->bh_write_commit(oid, start, length, tid); } @@ -450,9 +453,9 @@ class ObjectCacher { class C_LockAck : public Context { ObjectCacher *oc; public: - list oids; + list oids; tid_t tid; - C_LockAck(ObjectCacher *c, object_t o) : oc(c) { + C_LockAck(ObjectCacher *c, sobject_t o) : oc(c) { oids.push_back(o); } void finish(int r) { @@ -546,8 +549,8 @@ class ObjectCacher { bufferlist *bl, int flags, Context *onfinish) { - OSDRead *rd = prepare_read(bl, flags); - filer.file_to_extents(ino, layout, snapid, offset, len, rd->extents); + OSDRead *rd = prepare_read(snapid, bl, flags); + filer.file_to_extents(ino, layout, offset, len, rd->extents); return readx(rd, ino, onfinish); } @@ -555,7 +558,7 @@ class ObjectCacher { loff_t offset, size_t len, bufferlist& bl, utime_t mtime, int flags) { OSDWrite *wr = prepare_write(snapc, bl, mtime, flags); - filer.file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, wr->extents); + filer.file_to_extents(ino, layout, offset, len, wr->extents); return writex(wr, ino); } @@ -568,8 +571,8 @@ class ObjectCacher { loff_t offset, size_t len, bufferlist *bl, int flags, Mutex &lock) { - OSDRead *rd = prepare_read(bl, flags); - filer.file_to_extents(ino, layout, snapid, offset, len, rd->extents); + OSDRead *rd = prepare_read(snapid, bl, flags); + filer.file_to_extents(ino, layout, offset, len, rd->extents); return atomic_sync_readx(rd, ino, lock); } @@ -579,7 +582,7 @@ class ObjectCacher { bufferlist& bl, utime_t mtime, int flags, Mutex &lock) { OSDWrite *wr = prepare_write(snapc, bl, mtime, flags); - filer.file_to_extents(ino, layout, CEPH_NOSNAP, offset, len, wr->extents); + filer.file_to_extents(ino, layout, offset, len, wr->extents); return atomic_sync_writex(wr, ino, lock); } @@ -605,7 +608,7 @@ inline ostream& operator<<(ostream& out, ObjectCacher::BufferHead &bh) inline ostream& operator<<(ostream& out, ObjectCacher::Object &ob) { out << "object[" - << hex << ob.get_oid() << " ino " << ob.get_ino() << dec + << ob.get_soid() << " ino " << hex << ob.get_ino() << dec << " wr " << ob.last_write_tid << "/" << ob.last_ack_tid << "/" << ob.last_commit_tid; switch (ob.lock_state) { diff --git a/src/osdc/Objecter.h b/src/osdc/Objecter.h index 10d0fe2876b11..62b4581170d35 100644 --- a/src/osdc/Objecter.h +++ b/src/osdc/Objecter.h @@ -169,6 +169,7 @@ class Objecter { object_t oid; ceph_object_layout layout; vector ops; + snapid_t snap; bufferlist bl; bufferlist *pbl; __u64 *psize; @@ -181,8 +182,8 @@ class Objecter { bool paused; - ReadOp(object_t o, ceph_object_layout& ol, vector& op, int f, Context *of) : - oid(o), layout(ol), + ReadOp(object_t o, ceph_object_layout& ol, vector& op, snapid_t s, int f, Context *of) : + oid(o), layout(ol), snap(s), pbl(0), psize(0), flags(f), onfinish(of), tid(0), attempts(0), inc_lock(-1), paused(false) { @@ -301,17 +302,17 @@ class Objecter { tid_t read_submit(ReadOp *rd); tid_t modify_submit(ModifyOp *wr); - tid_t read(object_t oid, ceph_object_layout ol, vector& ops, - bufferlist *pbl, __u64 *psize, int flags, + tid_t read(object_t oid, ceph_object_layout ol, vector& ops, + snapid_t snap, bufferlist *pbl, __u64 *psize, int flags, Context *onfinish) { - ReadOp *rd = new ReadOp(oid, ol, ops, flags, onfinish); + ReadOp *rd = new ReadOp(oid, ol, ops, snap, flags, onfinish); rd->pbl = pbl; rd->psize = psize; return read_submit(rd); } tid_t read(object_t oid, ceph_object_layout ol, - ObjectRead& read, bufferlist *pbl, int flags, Context *onfinish) { - ReadOp *rd = new ReadOp(oid, ol, read.ops, flags, onfinish); + ObjectRead& read, snapid_t snap, bufferlist *pbl, int flags, Context *onfinish) { + ReadOp *rd = new ReadOp(oid, ol, read.ops, snap, flags, onfinish); rd->bl = read.data; rd->pbl = pbl; return read_submit(rd); @@ -326,29 +327,29 @@ class Objecter { } // high-level helpers - tid_t stat(object_t oid, ceph_object_layout ol, + tid_t stat(object_t oid, ceph_object_layout ol, snapid_t snap, __u64 *psize, int flags, Context *onfinish) { vector ops(1); memset(&ops[0], 0, sizeof(ops[0])); ops[0].op = CEPH_OSD_OP_STAT; - return read(oid, ol, ops, 0, psize, flags, onfinish); + return read(oid, ol, ops, snap, 0, psize, flags, onfinish); } - tid_t read(object_t oid, ceph_object_layout ol, - __u64 off, size_t len, bufferlist *pbl, int flags, + tid_t read(object_t oid, ceph_object_layout ol, + __u64 off, size_t len, snapid_t snap, bufferlist *pbl, int flags, Context *onfinish) { vector ops(1); memset(&ops[0], 0, sizeof(ops[0])); ops[0].op = CEPH_OSD_OP_READ; ops[0].offset = off; ops[0].length = len; - return read(oid, ol, ops, pbl, 0, flags, onfinish); + return read(oid, ol, ops, snap, pbl, 0, flags, onfinish); } tid_t read_full(object_t oid, ceph_object_layout ol, - bufferlist *pbl, int flags, + snapid_t snap, bufferlist *pbl, int flags, Context *onfinish) { - return read(oid, ol, 0, 0, pbl, flags, onfinish); + return read(oid, ol, 0, 0, snap, pbl, flags, onfinish); } tid_t mutate(object_t oid, ceph_object_layout ol, @@ -429,17 +430,17 @@ class Objecter { } }; - void sg_read(vector& extents, bufferlist *bl, int flags, Context *onfinish) { + void sg_read(vector& extents, snapid_t snap, bufferlist *bl, int flags, Context *onfinish) { if (extents.size() == 1) { read(extents[0].oid, extents[0].layout, extents[0].offset, extents[0].length, - bl, flags, onfinish); + snap, bl, flags, onfinish); } else { C_Gather *g = new C_Gather; vector resultbl(extents.size()); int i=0; for (vector::iterator p = extents.begin(); p != extents.end(); p++) { read(p->oid, p->layout, p->offset, p->length, - &resultbl[i++], flags, g->new_sub()); + snap, &resultbl[i++], flags, g->new_sub()); } g->set_finisher(new C_SGRead(this, extents, resultbl, bl, onfinish)); } diff --git a/src/streamtest.cc b/src/streamtest.cc index ec28247a82508..f866665184253 100644 --- a/src/streamtest.cc +++ b/src/streamtest.cc @@ -117,7 +117,7 @@ int main(int argc, const char **argv) //cout << "stop at " << end << std::endl; cout << "# offset\tack\tcommit" << std::endl; while (now < end) { - pobject_t poid(0, 0, object_t(1, 1)); + pobject_t poid(object_t(1, 1), 0); utime_t start = now; set_start(pos, now); ObjectStore::Transaction t; -- 2.39.5