From ba65cd701206cf04ecf6a5ebf40281a44c225b90 Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Tue, 13 May 2008 14:49:29 -0700 Subject: [PATCH] osd: put full precision ps in osd ops; keep in mind that ps -> pg mapping may shift as osdmap is updated --- src/Makefile.am | 5 +- src/dumpjournal.cc | 131 ++++++++++++++++++++++++++++++++++++++++++ src/include/ceph_fs.h | 2 +- src/kernel/osdmap.c | 3 +- src/messages/MOSDOp.h | 1 + src/osd/OSD.cc | 12 +++- src/osd/OSDMap.h | 38 ++++++++---- 7 files changed, 175 insertions(+), 17 deletions(-) create mode 100644 src/dumpjournal.cc diff --git a/src/Makefile.am b/src/Makefile.am index 73db5362bccaf..5b564c996225d 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -32,6 +32,9 @@ mkfs_ebofs_LDADD = libebofs.a libcommon.a libos.a cmds_SOURCES = cmds.cc msg/SimpleMessenger.cc cmds_LDADD = libmds.a libosdc.a libcrush.a libcommon.a +dumpjournal_SOURCES = dumpjournal.cc msg/SimpleMessenger.cc +dumpjournal_LDADD = libosdc.a libcrush.a libcommon.a + # osd cosd_SOURCES = cosd.cc msg/SimpleMessenger.cc cosd_LDADD = libosd.a libos.a libebofs.a libcrush.a libcommon.a @@ -139,7 +142,7 @@ bin_PROGRAMS = \ cmonctl \ mkmonfs monmaptool osdmaptool crushtool \ fakesyn \ - streamtest dupstore psim \ + streamtest dupstore psim dumpjournal \ test.ebofs mkfs.ebofs \ $(FUSEBIN) $(NEWSYN) noinst_LIBRARIES = \ diff --git a/src/dumpjournal.cc b/src/dumpjournal.cc new file mode 100644 index 0000000000000..c9c00bf972eac --- /dev/null +++ b/src/dumpjournal.cc @@ -0,0 +1,131 @@ +// -*- mode:C++; tab-width:8; c-basic-offset:2; indent-tabs-mode:t -*- +// vim: ts=8 sw=2 smarttab +/* + * Ceph - scalable distributed file system + * + * Copyright (C) 2004-2006 Sage Weil + * + * This is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License version 2.1, as published by the Free Software + * Foundation. See file COPYING. + * + */ + +#include +#include +#include +using namespace std; + +#include "config.h" + +#include "mon/MonMap.h" +#include "mon/MonClient.h" +#include "msg/SimpleMessenger.h" +#include "osd/OSDMap.h" +#include "messages/MOSDGetMap.h" +#include "osdc/Objecter.h" +#include "osdc/Journaler.h" +#include "mds/mdstypes.h" + +#include "common/Timer.h" + +#ifndef DARWIN +#include +#endif // DARWIN + +#include +#include +#include + + +OSDMap osdmap; +Mutex lock; +Cond cond; + +Messenger *messenger = 0; +Objecter *objecter = 0; +Journaler *journaler = 0; + +class Dumper : public Dispatcher { + void dispatch(Message *m) { + switch (m->get_type()) { + case CEPH_MSG_OSD_OPREPLY: + objecter->handle_osd_op_reply((MOSDOpReply *)m); + break; + case CEPH_MSG_OSD_MAP: + objecter->handle_osd_map((MOSDMap*)m); + break; + } + } +} dispatcher; + + +void usage() +{ + exit(1); +} + +int main(int argc, const char **argv, const char *envp[]) +{ + vector args; + argv_to_vec(argc, argv, args); + parse_config_options(args); + + vec_to_argv(args, argc, argv); + + int mds = 0; + + // get monmap + MonMap monmap; + MonClient mc; + if (mc.get_monmap(&monmap) < 0) + return -1; + + // start up network + rank.bind(); + g_conf.daemonize = false; // not us! + rank.start(); + messenger = rank.register_entity(entity_name_t::ADMIN()); + messenger->set_dispatcher(&dispatcher); + + inode_t log_inode; + memset(&log_inode, 0, sizeof(log_inode)); + log_inode.ino = MDS_INO_LOG_OFFSET + mds; + log_inode.layout = g_default_mds_log_layout; + + objecter = new Objecter(messenger, &monmap, &osdmap, lock); + journaler = new Journaler(log_inode, objecter, 0, &lock); + + objecter->set_client_incarnation(0); + + bool done; + journaler->recover(new C_SafeCond(&lock, &cond, &done)); + lock.Lock(); + while (!done) + cond.Wait(lock); + lock.Unlock(); + + __u64 start = journaler->get_read_pos(); + __u64 end = journaler->get_write_pos(); + __u64 len = end-start; + cout << "journal is " << start << "~" << len << std::endl; + + Filer filer(objecter); + bufferlist bl; + filer.read(log_inode, start, len, &bl, 0, new C_SafeCond(&lock, &cond, &done)); + lock.Lock(); + while (!done) + cond.Wait(lock); + lock.Unlock(); + + cout << "read " << bl.length() << " bytes" << std::endl; + bl.write_file("mds.journal.dump"); + messenger->shutdown(); + + // wait for messenger to finish + rank.wait(); + + return 0; +} + diff --git a/src/include/ceph_fs.h b/src/include/ceph_fs.h index e2f21de01897d..7e866cb71639e 100644 --- a/src/include/ceph_fs.h +++ b/src/include/ceph_fs.h @@ -187,7 +187,7 @@ static inline int ceph_stable_mod(int x, int b, int bmask) { * object layout - how a given object should be stored. */ struct ceph_object_layout { - __le64 ol_pgid; + __le64 ol_pgid; /* raw pg, with _full_ ps precision. */ __le32 ol_stripe_unit; } __attribute__ ((packed)); diff --git a/src/kernel/osdmap.c b/src/kernel/osdmap.c index e44ab719a25a2..910a08a5b3979 100644 --- a/src/kernel/osdmap.c +++ b/src/kernel/osdmap.c @@ -590,8 +590,7 @@ void calc_object_layout(struct ceph_object_layout *ol, } pgid.pg64 = 0; /* start with it zeroed out */ - pgid.pg.ps = ceph_stable_mod(bno + crush_hash32_2(ino, ino>>32), - num, num_mask); + pgid.pg.ps = bno + crush_hash32_2(ino, ino>>32); pgid.pg.preferred = preferred; pgid.pg.type = fl->fl_pg_type; pgid.pg.size = fl->fl_pg_size; diff --git a/src/messages/MOSDOp.h b/src/messages/MOSDOp.h index 13e43f18d97a9..384b6ace7756a 100644 --- a/src/messages/MOSDOp.h +++ b/src/messages/MOSDOp.h @@ -154,6 +154,7 @@ public: << " " << get_opname(get_op()) << " " << head.oid; if (get_length()) out << " " << get_offset() << "~" << get_length(); + out << " " << pg_t(head.layout.ol_pgid); if (is_retry_attempt()) out << " RETRY"; out << ")"; } diff --git a/src/osd/OSD.cc b/src/osd/OSD.cc index 31412b0c3b9d0..a49da183c8ce2 100644 --- a/src/osd/OSD.cc +++ b/src/osd/OSD.cc @@ -719,6 +719,14 @@ void OSD::activate_pg(pg_t pgid, epoch_t epoch) } } + // wake up _all_ pg waiters; raw pg -> actual pg mapping may have shifted + for (hash_map >::iterator p = waiting_for_pg.begin(); + p != waiting_for_pg.end(); + p++) + take_waiters(p->second); + waiting_for_pg.clear(); + + // finishers? finished_lock.Lock(); if (finished.empty()) { @@ -2512,8 +2520,10 @@ void OSD::handle_op(MOSDOp *op) op_queue_cond.Wait(osd_lock); } + // calc actual pgid + pg_t pgid = osdmap->raw_pg_to_pg(op->get_pg()); + // get and lock *pg. - const pg_t pgid = op->get_pg(); PG *pg = _have_pg(pgid) ? _lookup_lock_pg(pgid):0; logger->set("buf", buffer_total_alloc.test()); diff --git a/src/osd/OSDMap.h b/src/osd/OSDMap.h index 58f911a23b809..a5efd8c9ad1ec 100644 --- a/src/osd/OSDMap.h +++ b/src/osd/OSDMap.h @@ -476,26 +476,23 @@ private: } ceph_object_layout make_object_layout(object_t oid, int pg_type, int pg_size, int pg_pool, int preferred=-1, int object_stripe_unit = 0) { - int num = preferred >= 0 ? lpg_num:pg_num; - int num_mask = preferred >= 0 ? lpg_num_mask:pg_num_mask; - // calculate ps (placement seed) - ps_t ps; + ps_t ps; // NOTE: keep full precision, here! switch (g_conf.osd_object_layout) { case CEPH_OBJECT_LAYOUT_LINEAR: - ps = ceph_stable_mod(oid.bno + oid.ino, num, num_mask); + ps = oid.bno + oid.ino; break; case CEPH_OBJECT_LAYOUT_HASHINO: //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.ino>>32), num, num_mask); - ps = ceph_stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); + ps = oid.bno + crush_hash32_2(oid.ino, oid.ino>>32); break; case CEPH_OBJECT_LAYOUT_HASH: //ps = stable_mod(H( (oid.bno & oid.ino) ^ ((oid.bno^oid.ino) >> 32) ), num, num_mask); //ps = stable_mod(H(oid.bno) + H(oid.ino)^H(oid.ino>>32), num, num_mask); //ps = stable_mod(oid.bno + H(oid.bno+oid.ino)^H(oid.bno+oid.ino>>32), num, num_mask); - ps = ceph_stable_mod(oid.bno + crush_hash32_2(oid.ino, oid.ino>>32), num, num_mask); + ps = oid.bno + crush_hash32_2(oid.ino, oid.ino>>32); break; default: @@ -513,15 +510,32 @@ private: } + /* + * map a raw pg (with full precision ps) into an actual pg, for storage + */ + pg_t raw_pg_to_pg(pg_t pg) { + if (pg.preferred() >= 0) + pg.u.pg.ps = ceph_stable_mod(pg.ps(), lpg_num, lpg_num_mask); + else + pg.u.pg.ps = ceph_stable_mod(pg.ps(), pg_num, pg_num_mask); + return pg; + } + + /* + * map raw pg (full precision ps) into a placement ps + */ + ps_t raw_pg_to_pps(pg_t pg) { + if (pg.preferred() >= 0) + return ceph_stable_mod(pg.ps(), lpgp_num, lpgp_num_mask); + else + return ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask); + } + // pg -> (osd list) int pg_to_osds(pg_t pg, vector& osds) { // map to osds[] - ps_t pps; // placement ps - if (pg.preferred() >= 0) - pps = ceph_stable_mod(pg.ps(), lpgp_num, lpgp_num_mask); - else - pps = ceph_stable_mod(pg.ps(), pgp_num, pgp_num_mask); + ps_t pps = raw_pg_to_pps(pg); // placement ps switch (g_conf.osd_pg_layout) { case CEPH_PG_LAYOUT_CRUSH: -- 2.39.5