There's a race between the client and osd with a newly marked full
osdmap. If the client gets the new map first, it blocks writes and
everything works as expected, with no errors from the osd.

If the osd gets the map first, however, it will respond to any writes
with -ENOSPC. Clients will pass this up the stack, and not retry these
writes later. -ENOSPC isn't handled well by all clients. RBD, for
example, may pass it on to qemu or kernel rbd, both of which will
interpret it as EIO. Filesystems on top of rbd will not behave well
when they receive EIOs like this, especially if the cluster oscillates
between full and not full, so that some writes succeed while others fail.

To fix this, never return ENOSPC from the osd because of a map marked
full, and rely on the client to retry all writes when the map is no
longer marked full.
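
To make the decision concrete, here is a minimal sketch of the osd-side
check, with illustrative names (FullState, should_drop_write) that are not
the actual code in the patch below:

#include <cstdint>

using epoch_t = uint32_t;  // assumption: epochs are small unsigned integers

struct FullState {
  bool map_marked_full;          // current osdmap has the full flag set
  epoch_t last_map_marked_full;  // newest epoch known to have been marked full
};

// True if a write should be silently dropped rather than answered with
// -ENOSPC; the client will resend it once it sees a map without the flag.
bool should_drop_write(const FullState& s, epoch_t client_map_epoch) {
  return s.map_marked_full || client_map_epoch < s.last_map_marked_full;
}

The second condition covers clients whose op was tagged with a map epoch
older than the last epoch the osd saw marked full, i.e. clients that may
not know about the full flag yet.
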
Old clients talking to osds with this fix will hang instead of
propagating an error, but only if they run into this race
condition. ceph-fuse and rbd with caching enabled are not affected,
since the ObjectCacher will retry writes that return errors.

Refs: #6938
Backport: dumpling, emperor
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
(cherry picked from commit 4111729dda7437c23f59e7100b3c4a9ec4101dd0)
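
For the client side the fix relies on, here is a simplified, hypothetical
sketch (this is not the Objecter's real interface) of blocking writes while
the map is marked full and resending them once a later map clears the flag:

#include <deque>
#include <functional>

// Hypothetical client-side queue: holds writes back while the osdmap is
// marked full and replays them when a map without the full flag arrives.
class WriteQueue {
public:
  void submit(std::function<void()> send) {
    if (full_)
      pending_.push_back(std::move(send));  // block the write locally
    else
      send();
  }

  // Called for every newly received osdmap.
  void handle_map(bool marked_full) {
    full_ = marked_full;
    if (full_)
      return;
    // Map is no longer full: replay everything held back. A real client
    // also resends in-flight writes, which is why the osd can safely drop
    // them instead of returning an error.
    while (!pending_.empty()) {
      auto op = std::move(pending_.front());
      pending_.pop_front();
      op();
    }
  }

private:
  bool full_ = false;
  std::deque<std::function<void()>> pending_;
};

The osd-side changes below implement the drop behaviour and persist the
most recent full-marked epoch in the superblock so the check survives
restarts.
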
ObjectStore::Transaction &t = *_t;
// store new maps: queue for disk and put in the osdmap cache
+ epoch_t last_marked_full = 0;
epoch_t start = MAX(osdmap->get_epoch() + 1, first);
for (epoch_t e = start; e <= last; e++) {
map<epoch_t,bufferlist>::iterator p;
...
bufferlist& bl = p->second;
o->decode(bl);
+ if (o->test_flag(CEPH_OSDMAP_FULL))
+ last_marked_full = e;
pinned_maps.push_back(add_map(o));
hobject_t fulloid = get_osdmap_pobject_name(e);
...
assert(0 == "bad fsid");
}
+ if (o->test_flag(CEPH_OSDMAP_FULL))
+ last_marked_full = e;
pinned_maps.push_back(add_map(o));
bufferlist fbl;
...
superblock.oldest_map = first;
superblock.newest_map = last;
+ if (last_marked_full > superblock.last_map_marked_full)
+ superblock.last_map_marked_full = last_marked_full;
map_lock.get_write();
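
The superblock only ever raises last_map_marked_full, so replaying older
maps cannot move it backwards and the value survives an osd restart. A
standalone sketch of that update rule, using illustrative names rather than
the actual structures:

#include <algorithm>
#include <cstdint>
#include <vector>

using epoch_t = uint32_t;

// Raise the persisted high-water mark to the newest full-marked epoch seen
// while storing maps; never lower it.
void note_full_epochs(const std::vector<epoch_t>& full_epochs,
                      epoch_t& persisted_last_full) {
  for (epoch_t e : full_epochs)
    persisted_last_full = std::max(persisted_last_full, e);
}

The write-handling path shown next consults this persisted value.
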
if (op->may_write()) {
// full?
if ((service.check_failsafe_full() ||
- osdmap->test_flag(CEPH_OSDMAP_FULL)) &&
+ osdmap->test_flag(CEPH_OSDMAP_FULL) ||
+ m->get_map_epoch() < superblock.last_map_marked_full) &&
!m->get_source().is_mds()) { // FIXME: we'll exclude mds writes for now.
- service.reply_op_error(op, -ENOSPC);
+ // Drop the request, since the client will retry when the full
+ // flag is unset.
return;
}
...
void OSDSuperblock::encode(bufferlist &bl) const
{
- ENCODE_START(5, 5, bl);
+ ENCODE_START(6, 5, bl);
::encode(cluster_fsid, bl);
::encode(whoami, bl);
::encode(current_epoch, bl);
...
::encode(clean_thru, bl);
::encode(mounted, bl);
::encode(osd_fsid, bl);
+ ::encode(last_map_marked_full, bl);
ENCODE_FINISH(bl);
}
void OSDSuperblock::decode(bufferlist::iterator &bl)
{
- DECODE_START_LEGACY_COMPAT_LEN(5, 5, 5, bl);
+ DECODE_START_LEGACY_COMPAT_LEN(6, 5, 5, bl);
if (struct_v < 3) {
string magic;
::decode(magic, bl);
...
::decode(mounted, bl);
if (struct_v >= 4)
::decode(osd_fsid, bl);
+ if (struct_v >= 6)
+ ::decode(last_map_marked_full, bl);
DECODE_FINISH(bl);
}
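
Adding the field bumps the encoded version from 5 to 6 while the compat
version stays at 5, and the decoder only reads it when struct_v >= 6, so
superblocks written by older osds still decode and simply leave
last_map_marked_full at its default of 0. A minimal sketch of this
version-guard pattern, using plain integers instead of Ceph's encoding
macros:

#include <cstdint>
#include <vector>

// Toy word stream standing in for Ceph's bufferlist.
struct Stream {
  std::vector<uint32_t> words;
  size_t pos = 0;
  void put(uint32_t v) { words.push_back(v); }
  uint32_t get() { return pos < words.size() ? words[pos++] : 0; }
};

struct Superblock {
  uint32_t mounted = 0;
  uint32_t clean_thru = 0;
  uint32_t last_map_marked_full = 0;  // new in version 6
};

void encode(const Superblock& sb, Stream& s) {
  s.put(6);  // struct version written by new code
  s.put(sb.mounted);
  s.put(sb.clean_thru);
  s.put(sb.last_map_marked_full);
}

void decode(Superblock& sb, Stream& s) {
  uint32_t struct_v = s.get();
  sb.mounted = s.get();
  sb.clean_thru = s.get();
  if (struct_v >= 6)                     // old encodings end here; the
    sb.last_map_marked_full = s.get();   // field keeps its default of 0
}

Since the field defaults to 0 and epochs are unsigned, the new comparison in
the write path is never true on an osd that has not yet stored a full-marked
map, so upgraded osds behave exactly as before until a full map arrives.
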
...
f->close_section();
f->dump_int("clean_thru", clean_thru);
f->dump_int("last_epoch_mounted", mounted);
+ f->dump_int("last_map_marked_full", last_map_marked_full);
}
void OSDSuperblock::generate_test_instances(list<OSDSuperblock*>& o)
...
z.mounted = 8;
z.clean_thru = 7;
o.push_back(new OSDSuperblock(z));
+ z.last_map_marked_full = 7;
+ o.push_back(new OSDSuperblock(z));
}
// -- SnapSet --
...
// last interval over which i mounted and was then active
epoch_t mounted; // last epoch i mounted
epoch_t clean_thru; // epoch i was active and clean thru
+ epoch_t last_map_marked_full; // last epoch osdmap was marked full
OSDSuperblock() :
whoami(-1),
current_epoch(0), oldest_map(0), newest_map(0), weight(0),
- mounted(0), clean_thru(0) {
+ mounted(0), clean_thru(0), last_map_marked_full(0) {
}
void encode(bufferlist &bl) const;