From: Aravind Ramesh Date: Tue, 26 Jul 2022 09:52:17 +0000 (+0530) Subject: crimson/zns: advance write pointer before writing tail-info. X-Git-Tag: v18.0.0~94^2~8 X-Git-Url: http://git-server-git.apps.pok.os.sepia.ceph.com/?a=commitdiff_plain;h=773d863019e83d0bf03916b408b63db7cde71a2b;p=ceph.git crimson/zns: advance write pointer before writing tail-info. SegmentAllocator::close_segment() writes tail information to a segment before closing the segment, and this is written at the end of segment. However, for ZNS SSDs, the writes have to always happen at write pointer, so writing tail info at the end of a zone fails if the WP is not at the offset requested by close_segment(). If the write pointer is not at lba where the tail information is written, then advance write pointer by writing zeroes to the zone from it's current write pointer. Then write the tail information at the end of zone. Added advance_wp() function which advances the write pointer and then write tail information, in case of ZNS devices but for a regular device it continues to write at the end of segment. Do close_segment() call after writing tail information, closing a segment first and then writing tail information can cause potential race conditions on a zns backed segment. Signed-off-by: Aravind Ramesh --- diff --git a/src/crimson/os/seastore/journal/segment_allocator.cc b/src/crimson/os/seastore/journal/segment_allocator.cc index 28f7a30cfe5..4607d056586 100644 --- a/src/crimson/os/seastore/journal/segment_allocator.cc +++ b/src/crimson/os/seastore/journal/segment_allocator.cc @@ -219,7 +219,6 @@ SegmentAllocator::close_segment() // Note: make sure no one can access the current segment once closing auto seg_to_close = std::move(current_segment); auto close_segment_id = seg_to_close->get_segment_id(); - segment_provider.close_segment(close_segment_id); auto close_seg_info = segment_provider.get_seg_info(close_segment_id); ceph_assert((close_seg_info.modify_time == NULL_TIME && close_seg_info.num_extents == 0) || @@ -247,17 +246,25 @@ SegmentAllocator::close_segment() bl.append(bp); assert(bl.length() == sm_group.get_rounded_tail_length()); - return seg_to_close->write( - sm_group.get_segment_size() - sm_group.get_rounded_tail_length(), - bl - ).safe_then([seg_to_close=std::move(seg_to_close)] { - return seg_to_close->close(); + + auto p_seg_to_close = seg_to_close.get(); + return p_seg_to_close->advance_wp( + sm_group.get_segment_size() - sm_group.get_rounded_tail_length() + ).safe_then([this, FNAME, bl=std::move(bl), p_seg_to_close]() mutable { + DEBUG("Writing tail info to segment {}", p_seg_to_close->get_segment_id()); + return p_seg_to_close->write( + sm_group.get_segment_size() - sm_group.get_rounded_tail_length(), + std::move(bl)); + }).safe_then([p_seg_to_close] { + return p_seg_to_close->close(); + }).safe_then([this, seg_to_close=std::move(seg_to_close)] { + segment_provider.close_segment(seg_to_close->get_segment_id()); }).handle_error( close_segment_ertr::pass_further{}, - crimson::ct_error::assert_all{ - "Invalid error in SegmentAllocator::close_segment" - } - ); + crimson::ct_error::assert_all { + "Invalid error in SegmentAllocator::close_segment" + }); + } RecordBatch::add_pending_ret diff --git a/src/crimson/os/seastore/segment_manager.h b/src/crimson/os/seastore/segment_manager.h index b565125c50b..e14ef0d8d07 100644 --- a/src/crimson/os/seastore/segment_manager.h +++ b/src/crimson/os/seastore/segment_manager.h @@ -128,6 +128,16 @@ public: virtual write_ertr::future<> write( seastore_off_t offset, ceph::bufferlist bl) = 0; + /** + * advance_wp + * + * advance the segment write pointer, + * needed when writing at wp is strictly implemented. ex: ZNS backed segments + * @param offset: advance write pointer till the given offset + */ + virtual write_ertr::future<> advance_wp( + seastore_off_t offset) = 0; + virtual ~Segment() {} }; using SegmentRef = boost::intrusive_ptr; diff --git a/src/crimson/os/seastore/segment_manager/block.cc b/src/crimson/os/seastore/segment_manager/block.cc index e25feadc72b..7b4b004dd95 100644 --- a/src/crimson/os/seastore/segment_manager/block.cc +++ b/src/crimson/os/seastore/segment_manager/block.cc @@ -421,6 +421,11 @@ Segment::write_ertr::future<> BlockSegment::write( return manager.segment_write(paddr, bl); } +Segment::write_ertr::future<> BlockSegment::advance_wp( + seastore_off_t offset) { + return write_ertr::now(); +} + Segment::close_ertr::future<> BlockSegmentManager::segment_close( segment_id_t id, seastore_off_t write_pointer) { diff --git a/src/crimson/os/seastore/segment_manager/block.h b/src/crimson/os/seastore/segment_manager/block.h index 6150f8c177d..765d78e881f 100644 --- a/src/crimson/os/seastore/segment_manager/block.h +++ b/src/crimson/os/seastore/segment_manager/block.h @@ -97,6 +97,7 @@ public: seastore_off_t get_write_ptr() const final { return write_pointer; } close_ertr::future<> close() final; write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(seastore_off_t offset) final; ~BlockSegment() {} }; diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.cc b/src/crimson/os/seastore/segment_manager/ephemeral.cc index 919155bebd1..6a97a513033 100644 --- a/src/crimson/os/seastore/segment_manager/ephemeral.cc +++ b/src/crimson/os/seastore/segment_manager/ephemeral.cc @@ -87,6 +87,12 @@ Segment::write_ertr::future<> EphemeralSegment::write( return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); } +Segment::write_ertr::future<> EphemeralSegment::advance_wp( + seastore_off_t offset) +{ + return write_ertr::now(); +} + Segment::close_ertr::future<> EphemeralSegmentManager::segment_close(segment_id_t id) { auto s_id = id.device_segment_id(); diff --git a/src/crimson/os/seastore/segment_manager/ephemeral.h b/src/crimson/os/seastore/segment_manager/ephemeral.h index e282943626f..7f69e60b40e 100644 --- a/src/crimson/os/seastore/segment_manager/ephemeral.h +++ b/src/crimson/os/seastore/segment_manager/ephemeral.h @@ -48,6 +48,7 @@ public: seastore_off_t get_write_ptr() const final { return write_pointer; } close_ertr::future<> close() final; write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(seastore_off_t offset) final; ~EphemeralSegment() {} }; diff --git a/src/crimson/os/seastore/segment_manager/zns.cc b/src/crimson/os/seastore/segment_manager/zns.cc index 95c35b72731..99b56d09c09 100644 --- a/src/crimson/os/seastore/segment_manager/zns.cc +++ b/src/crimson/os/seastore/segment_manager/zns.cc @@ -14,6 +14,8 @@ SET_SUBSYS(seastore_device); #define SECT_SHIFT 9 #define RESERVED_ZONES 1 +// limit the max padding buf size to 1MB +#define MAX_PADDING_SIZE 1048576 namespace crimson::os::seastore::segment_manager::zns { @@ -428,7 +430,7 @@ ZNSSegmentManager::open_ertr::future ZNSSegmentManager::open( ); } ).safe_then([=] { - DEBUG("segment open successful"); + DEBUG("segment {}, open successful", id); return open_ertr::future( open_ertr::ready_future_marker{}, SegmentRef(new ZNSSegment(*this, id)) @@ -591,9 +593,9 @@ Segment::write_ertr::future<> ZNSSegment::write( { LOG_PREFIX(ZNSSegment::write); if (offset != write_pointer || offset % manager.metadata.block_size != 0) { - ERROR("Invalid segment write on segment {} to offset {}", - id, - offset); + ERROR("Segment offset and zone write pointer mismatch. " + "segment {} segment-offset {} write pointer {}", + id, offset, write_pointer); return crimson::ct_error::invarg::make(); } if (offset + bl.length() > manager.metadata.segment_size) @@ -603,4 +605,56 @@ Segment::write_ertr::future<> ZNSSegment::write( return manager.segment_write(paddr_t::make_seg_paddr(id, offset), bl); } +Segment::write_ertr::future<> ZNSSegment::write_padding_bytes( + size_t padding_bytes) +{ + LOG_PREFIX(ZNSSegment::write_padding_bytes); + DEBUG("Writing {} padding bytes to segment {} at wp {}", + padding_bytes, id, write_pointer); + + return crimson::repeat([FNAME, padding_bytes, this] () mutable { + size_t bufsize = 0; + if (padding_bytes >= MAX_PADDING_SIZE) { + bufsize = MAX_PADDING_SIZE; + } else { + bufsize = padding_bytes; + } + + padding_bytes -= bufsize; + bufferptr bp(ceph::buffer::create_page_aligned(bufsize)); + bp.zero(); + bufferlist padd_bl; + padd_bl.append(bp); + return write(write_pointer, padd_bl).safe_then([FNAME, padding_bytes, this]() { + if (padding_bytes == 0) { + return write_ertr::make_ready_future(seastar::stop_iteration::yes); + } else { + return write_ertr::make_ready_future(seastar::stop_iteration::no); + } + }); + }); +} + +// Advance write pointer, to given offset. +Segment::write_ertr::future<> ZNSSegment::advance_wp( + seastore_off_t offset) +{ + LOG_PREFIX(ZNSSegment::advance_wp); + + DEBUG("Advancing write pointer from {} to {}", write_pointer, offset); + if (offset < write_pointer) { + return crimson::ct_error::invarg::make(); + } + + size_t padding_bytes = offset - write_pointer; + + if (padding_bytes == 0) { + return write_ertr::now(); + } + + assert(padding_bytes % manager.metadata.block_size == 0); + + return write_padding_bytes(padding_bytes); +} + } diff --git a/src/crimson/os/seastore/segment_manager/zns.h b/src/crimson/os/seastore/segment_manager/zns.h index 6eb13ac2a95..6b18f950603 100644 --- a/src/crimson/os/seastore/segment_manager/zns.h +++ b/src/crimson/os/seastore/segment_manager/zns.h @@ -73,6 +73,7 @@ namespace crimson::os::seastore::segment_manager::zns { seastore_off_t get_write_ptr() const final { return write_pointer; } close_ertr::future<> close() final; write_ertr::future<> write(seastore_off_t offset, ceph::bufferlist bl) final; + write_ertr::future<> advance_wp(seastore_off_t offset) final; ~ZNSSegment() {} private: @@ -80,6 +81,7 @@ namespace crimson::os::seastore::segment_manager::zns { ZNSSegmentManager &manager; const segment_id_t id; seastore_off_t write_pointer = 0; + write_ertr::future<> write_padding_bytes(size_t padding_bytes); }; class ZNSSegmentManager final : public SegmentManager{