Summary:
The original goal is to propagate failures from `GetContext::SaveValue()` -> `GetContext::GetBlobValue()` -> `BlobFetcher::FetchBlob()` up to the user. This call sequence happens when a merge chain ends with a base value in a blob file.
There's also fixes for bugs encountered along the way where non-ok statuses were ignored/overwritten, and a bit of plumbing work for functions that had no capability to return a status.
Pull Request resolved: https://github.com/facebook/rocksdb/pull/12462
Test Plan:
A repro command
```
db=/dev/shm/dbstress_db ; exp=/dev/shm/dbstress_exp ; rm -rf $db $exp ; mkdir -p $db $exp
./db_stress \
--clear_column_family_one_in=0 \
--test_batches_snapshots=0 \
--write_fault_one_in=0 \
--use_put_entity_one_in=0 \
--prefixpercent=0 \
--read_fault_one_in=0 \
--readpercent=0 \
--reopen=0 \
--set_options_one_in=10000 \
--delpercent=0 \
--delrangepercent=0 \
--open_metadata_write_fault_one_in=0 \
--open_read_fault_one_in=0 \
--open_write_fault_one_in=0 \
--destroy_db_initially=0 \
--ingest_external_file_one_in=0 \
--iterpercent=0 \
--nooverwritepercent=0 \
--db=$db \
--enable_blob_files=1 \
--expected_values_dir=$exp \
--max_background_compactions=20 \
--max_bytes_for_level_base=
2097152 \
--max_key=100000 \
--min_blob_size=0 \
--open_files=-1 \
--ops_per_thread=
100000000 \
--prefix_size=-1 \
--target_file_size_base=524288 \
--use_merge=1 \
--value_size_mult=32 \
--write_buffer_size=524288 \
--writepercent=100
```
It used to fail like:
```
...
frame https://github.com/facebook/rocksdb/issues/9: 0x00007fc63903bc93 libc.so.6`__GI___assert_fail(assertion="HasDefaultColumn(columns)", file="fbcode/internal_repo_rocksdb/repo/db/wide/wide_columns_helper.h", line=33, function="static const rocksdb::Slice &rocksdb::WideColumnsHelper::GetDefaultColumn(const rocksdb::WideColumns &)") at assert.c:101:3
frame https://github.com/facebook/rocksdb/issues/10: 0x00000000006f7e92 db_stress`rocksdb::Version::Get(rocksdb::ReadOptions const&, rocksdb::LookupKey const&, rocksdb::PinnableSlice*, rocksdb::PinnableWideColumns*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>*, rocksdb::Status*, rocksdb::MergeContext*, unsigned long*, rocksdb::PinnedIteratorsManager*, bool*, bool*, unsigned long*, rocksdb::ReadCallback*, bool*, bool) [inlined] rocksdb::WideColumnsHelper::GetDefaultColumn(columns=size=0) at wide_columns_helper.h:33
frame https://github.com/facebook/rocksdb/issues/11: 0x00000000006f7e76 db_stress`rocksdb::Version::Get(this=0x00007fc5ec763000, read_options=<unavailable>, k=<unavailable>, value=0x0000000000000000, columns=0x00007fc6035fd1d8, timestamp=<unavailable>, status=0x00007fc6035fd250, merge_context=0x00007fc6035fce40, max_covering_tombstone_seq=0x00007fc6035fce90, pinned_iters_mgr=0x00007fc6035fcdf0, value_found=0x0000000000000000, key_exists=0x0000000000000000, seq=0x0000000000000000, callback=0x0000000000000000, is_blob=0x0000000000000000, do_merge=<unavailable>) at version_set.cc:2492
frame https://github.com/facebook/rocksdb/issues/12: 0x000000000051e245 db_stress`rocksdb::DBImpl::GetImpl(this=0x00007fc637a86000, read_options=0x00007fc6035fcf60, key=<unavailable>, get_impl_options=0x00007fc6035fd000) at db_impl.cc:2408
frame https://github.com/facebook/rocksdb/issues/13: 0x000000000050cec2 db_stress`rocksdb::DBImpl::GetEntity(this=0x00007fc637a86000, _read_options=<unavailable>, column_family=<unavailable>, key=0x00007fc6035fd3c8, columns=0x00007fc6035fd1d8) at db_impl.cc:2109
frame https://github.com/facebook/rocksdb/issues/14: 0x000000000074f688 db_stress`rocksdb::(anonymous namespace)::MemTableInserter::MergeCF(this=0x00007fc6035fd450, column_family_id=2, key=0x00007fc6035fd3c8, value=0x00007fc6035fd3a0) at write_batch.cc:2656
frame https://github.com/facebook/rocksdb/issues/15: 0x00000000007476fc db_stress`rocksdb::WriteBatchInternal::Iterate(wb=0x00007fc6035fe698, handler=0x00007fc6035fd450, begin=12, end=<unavailable>) at write_batch.cc:607
frame https://github.com/facebook/rocksdb/issues/16: 0x000000000074d7dd db_stress`rocksdb::WriteBatchInternal::InsertInto(rocksdb::WriteThread::WriteGroup&, unsigned long, rocksdb::ColumnFamilyMemTables*, rocksdb::FlushScheduler*, rocksdb::TrimHistoryScheduler*, bool, unsigned long, rocksdb::DB*, bool, bool, bool) [inlined] rocksdb::WriteBatch::Iterate(this=<unavailable>, handler=0x00007fc6035fd450) const at write_batch.cc:505
frame https://github.com/facebook/rocksdb/issues/17: 0x000000000074d77b db_stress`rocksdb::WriteBatchInternal::InsertInto(write_group=<unavailable>, sequence=<unavailable>, memtables=<unavailable>, flush_scheduler=<unavailable>, trim_history_scheduler=<unavailable>, ignore_missing_column_families=<unavailable>, recovery_log_number=0, db=0x00007fc637a86000, concurrent_memtable_writes=<unavailable>, seq_per_batch=false, batch_per_txn=<unavailable>) at write_batch.cc:3084
frame https://github.com/facebook/rocksdb/issues/18: 0x0000000000631d77 db_stress`rocksdb::DBImpl::PipelinedWriteImpl(this=0x00007fc637a86000, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000) at db_impl_write.cc:807
frame https://github.com/facebook/rocksdb/issues/19: 0x000000000062ceeb db_stress`rocksdb::DBImpl::WriteImpl(this=<unavailable>, write_options=<unavailable>, my_batch=0x00007fc6035fe698, callback=0x0000000000000000, log_used=<unavailable>, log_ref=0, disable_memtable=<unavailable>, seq_used=0x0000000000000000, batch_cnt=0, pre_release_callback=0x0000000000000000, post_memtable_callback=0x0000000000000000) at db_impl_write.cc:312
frame https://github.com/facebook/rocksdb/issues/20: 0x000000000062c8ec db_stress`rocksdb::DBImpl::Write(this=0x00007fc637a86000, write_options=0x00007fc6035feca8, my_batch=0x00007fc6035fe698) at db_impl_write.cc:157
frame https://github.com/facebook/rocksdb/issues/21: 0x000000000062b847 db_stress`rocksdb::DB::Merge(this=0x00007fc637a86000, opt=0x00007fc6035feca8, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, value=0x00007fc6035fe830) at db_impl_write.cc:2544
frame https://github.com/facebook/rocksdb/issues/22: 0x000000000062b6ef db_stress`rocksdb::DBImpl::Merge(this=0x00007fc637a86000, o=<unavailable>, column_family=0x00007fc6370bf140, key=0x00007fc6035fe8d8, val=0x00007fc6035fe830) at db_impl_write.cc:72
frame https://github.com/facebook/rocksdb/issues/23: 0x00000000004d6397 db_stress`rocksdb::NonBatchedOpsStressTest::TestPut(this=0x00007fc637041000, thread=0x00007fc6370dbc00, write_opts=0x00007fc6035feca8, read_opts=0x00007fc6035fe9c8, rand_column_families=<unavailable>, rand_keys=size=1, value={P\xe9_\x03\xc6\x7f\0\0}) at no_batched_ops_stress.cc:1317
frame https://github.com/facebook/rocksdb/issues/24: 0x000000000049361d db_stress`rocksdb::StressTest::OperateDb(this=0x00007fc637041000, thread=0x00007fc6370dbc00) at db_stress_test_base.cc:1148
...
```
Reviewed By: ltamasi
Differential Revision:
D55157795
Pulled By: ajkr
fbshipit-source-id:
5f7c1380ead5794c29d41680028e34b839744764
ASSERT_EQ(Get("Key1"), "v1,v2,v3");
}
+TEST_F(DBBlobBasicTest, GetMergeBlobFromMemoryTier) {
+ Options options = GetDefaultOptions();
+ options.merge_operator = MergeOperators::CreateStringAppendOperator();
+ options.enable_blob_files = true;
+ options.min_blob_size = 0;
+
+ Reopen(options);
+
+ ASSERT_OK(Put(Key(0), "v1"));
+ ASSERT_OK(Flush());
+ ASSERT_OK(Merge(Key(0), "v2"));
+ ASSERT_OK(Flush());
+
+ // Regular `Get()` loads data block to cache.
+ std::string value;
+ ASSERT_OK(db_->Get(ReadOptions(), Key(0), &value));
+ ASSERT_EQ("v1,v2", value);
+
+ // Base value blob is still uncached, so an in-memory read will fail.
+ ReadOptions read_options;
+ read_options.read_tier = kBlockCacheTier;
+ ASSERT_TRUE(db_->Get(read_options, Key(0), &value).IsIncomplete());
+}
+
TEST_F(DBBlobBasicTest, MultiGetMergeBlobWithPut) {
constexpr size_t num_keys = 3;
bool TableCache::GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
size_t prefix_size, GetContext* get_context,
- SequenceNumber seq_no) {
+ Status* read_status, SequenceNumber seq_no) {
bool found = false;
row_cache_key.TrimAppend(prefix_size, user_key.data(), user_key.size());
row_cache.RegisterReleaseAsCleanup(row_handle, value_pinner);
// If row cache hit, knowing cache key is the same to row_cache_key,
// can use row_cache_key's seq no to construct InternalKey.
- replayGetContextLog(*row_cache.Value(row_handle), user_key, get_context,
- &value_pinner, seq_no);
+ *read_status = replayGetContextLog(*row_cache.Value(row_handle), user_key,
+ get_context, &value_pinner, seq_no);
RecordTick(ioptions_.stats, ROW_CACHE_HIT);
found = true;
} else {
// Check row cache if enabled.
// Reuse row_cache_key sequence number when row cache hits.
+ Status s;
if (ioptions_.row_cache && !get_context->NeedToReadSequence()) {
auto user_key = ExtractUserKey(k);
uint64_t cache_entry_seq_no =
CreateRowCacheKeyPrefix(options, fd, k, get_context, row_cache_key);
done = GetFromRowCache(user_key, row_cache_key, row_cache_key.Size(),
- get_context, cache_entry_seq_no);
+ get_context, &s, cache_entry_seq_no);
if (!done) {
row_cache_entry = &row_cache_entry_buffer;
}
}
- Status s;
TableReader* t = fd.table_reader;
TypedHandle* handle = nullptr;
- if (!done) {
- assert(s.ok());
+ if (s.ok() && !done) {
if (t == nullptr) {
s = FindTable(options, file_options_, internal_comparator, file_meta,
&handle, block_protection_bytes_per_key, prefix_extractor,
// user key to row_cache_key at offset prefix_size
bool GetFromRowCache(const Slice& user_key, IterKey& row_cache_key,
size_t prefix_size, GetContext* get_context,
+ Status* read_status,
SequenceNumber seq_no = kMaxSequenceNumber);
const ImmutableOptions& ioptions_;
std::string db_session_id_;
};
-} // namespace ROCKSDB_NAMESPACE
\ No newline at end of file
+} // namespace ROCKSDB_NAMESPACE
GetContext* get_context = miter->get_context;
- if (GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
- get_context)) {
+ Status read_status;
+ bool ret =
+ GetFromRowCache(user_key, row_cache_key, row_cache_key_prefix_size,
+ get_context, &read_status);
+ if (!read_status.ok()) {
+ CO_RETURN read_status;
+ }
+ if (ret) {
table_range.SkipKey(miter);
} else {
row_cache_entries.emplace_back();
biter.key(), &parsed_key, false /* log_err_key */); // TODO
if (!pik_status.ok()) {
s = pik_status;
+ break;
}
- if (!get_context->SaveValue(
- parsed_key, biter.value(), &matched,
- biter.IsValuePinned() ? &biter : nullptr)) {
+ Status read_status;
+ bool ret = get_context->SaveValue(
+ parsed_key, biter.value(), &matched, &read_status,
+ biter.IsValuePinned() ? &biter : nullptr);
+ if (!read_status.ok()) {
+ s = read_status;
+ break;
+ }
+ if (!ret) {
if (get_context->State() == GetContext::GetState::kFound) {
does_referenced_key_exist = true;
referenced_data_size = biter.key().size() + biter.value().size();
break;
}
}
- s = biter.status();
+ if (s.ok()) {
+ s = biter.status();
+ }
if (!s.ok()) {
break;
}
biter->key(), &parsed_key, false /* log_err_key */); // TODO
if (!pik_status.ok()) {
s = pik_status;
+ break;
+ }
+ Status read_status;
+ bool ret = get_context->SaveValue(
+ parsed_key, biter->value(), &matched, &read_status,
+ value_pinner ? value_pinner : nullptr);
+ if (!read_status.ok()) {
+ s = read_status;
+ break;
}
- if (!get_context->SaveValue(parsed_key, biter->value(), &matched,
- value_pinner)) {
+ if (!ret) {
if (get_context->State() == GetContext::GetState::kFound) {
does_referenced_key_exist = true;
referenced_data_size =
done = true;
break;
}
- s = biter->status();
+ if (s.ok()) {
+ s = biter->status();
+ }
}
// Write the block cache access.
// XXX: There appear to be 'break' statements above that bypass this
return s;
}
bool dont_care __attribute__((__unused__));
- get_context->SaveValue(found_ikey, value, &dont_care);
+ get_context->SaveValue(found_ikey, value, &dont_care, &s);
+ if (!s.ok()) {
+ return s;
+ }
}
// We don't support merge operations. So, we return here.
return Status::OK();
bool GetContext::SaveValue(const ParsedInternalKey& parsed_key,
const Slice& value, bool* matched,
- Cleanable* value_pinner) {
+ Status* read_status, Cleanable* value_pinner) {
assert(matched);
assert((state_ != kMerge && parsed_key.type != kTypeMerge) ||
merge_context_ != nullptr);
// merge_context_->operand_list
if (type == kTypeBlobIndex) {
PinnableSlice pin_val;
- if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val) ==
- false) {
+ if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val,
+ read_status) == false) {
return false;
}
Slice blob_value(pin_val);
assert(merge_operator_ != nullptr);
if (type == kTypeBlobIndex) {
PinnableSlice pin_val;
- if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val) ==
- false) {
+ if (GetBlobValue(parsed_key.user_key, unpacked_value, &pin_val,
+ read_status) == false) {
return false;
}
Slice blob_value(pin_val);
}
bool GetContext::GetBlobValue(const Slice& user_key, const Slice& blob_index,
- PinnableSlice* blob_value) {
+ PinnableSlice* blob_value, Status* read_status) {
constexpr FilePrefetchBuffer* prefetch_buffer = nullptr;
constexpr uint64_t* bytes_read = nullptr;
- Status status = blob_fetcher_->FetchBlob(
- user_key, blob_index, prefetch_buffer, blob_value, bytes_read);
- if (!status.ok()) {
- if (status.IsIncomplete()) {
+ *read_status = blob_fetcher_->FetchBlob(user_key, blob_index, prefetch_buffer,
+ blob_value, bytes_read);
+ if (!read_status->ok()) {
+ if (read_status->IsIncomplete()) {
// FIXME: this code is not covered by unit tests
MarkKeyMayExist();
return false;
}
}
-void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
- GetContext* get_context, Cleanable* value_pinner,
- SequenceNumber seq_no) {
+Status replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+ GetContext* get_context, Cleanable* value_pinner,
+ SequenceNumber seq_no) {
Slice s = replay_log;
Slice ts;
size_t ts_sz = get_context->TimestampSize();
(void)ret;
- get_context->SaveValue(ikey, value, &dont_care, value_pinner);
+ Status read_status;
+ get_context->SaveValue(ikey, value, &dont_care, &read_status, value_pinner);
+ if (!read_status.ok()) {
+ return read_status;
+ }
}
+ return Status::OK();
}
} // namespace ROCKSDB_NAMESPACE
// Returns True if more keys need to be read (due to merges) or
// False if the complete value has been found.
bool SaveValue(const ParsedInternalKey& parsed_key, const Slice& value,
- bool* matched, Cleanable* value_pinner = nullptr);
+ bool* matched, Status* read_status,
+ Cleanable* value_pinner = nullptr);
// Simplified version of the previous function. Should only be used when we
// know that the operation is a Put.
void MergeWithWideColumnBaseValue(const Slice& entity);
bool GetBlobValue(const Slice& user_key, const Slice& blob_index,
- PinnableSlice* blob_value);
+ PinnableSlice* blob_value, Status* read_status);
void appendToReplayLog(ValueType type, Slice value, Slice ts);
// Call this to replay a log and bring the get_context up to date. The replay
// log must have been created by another GetContext object, whose replay log
// must have been set by calling GetContext::SetReplayLog().
-void replayGetContextLog(const Slice& replay_log, const Slice& user_key,
- GetContext* get_context,
- Cleanable* value_pinner = nullptr,
- SequenceNumber seq_no = kMaxSequenceNumber);
+Status replayGetContextLog(const Slice& replay_log, const Slice& user_key,
+ GetContext* get_context,
+ Cleanable* value_pinner = nullptr,
+ SequenceNumber seq_no = kMaxSequenceNumber);
} // namespace ROCKSDB_NAMESPACE
}
bool dont_care __attribute__((__unused__));
- if (!get_context->SaveValue(parsed_key, iter->value(), &dont_care)) {
+ Status read_status;
+ bool ret = get_context->SaveValue(parsed_key, iter->value(), &dont_care,
+ &read_status);
+ if (!read_status.ok()) {
+ return read_status;
+ }
+ if (!ret) {
break;
}
}
// can we enable the fast path?
if (internal_comparator_.Compare(found_key, parsed_target) >= 0) {
bool dont_care __attribute__((__unused__));
- if (!get_context->SaveValue(found_key, found_value, &dont_care,
- dummy_cleanable_.get())) {
+ bool ret = get_context->SaveValue(found_key, found_value, &dont_care, &s,
+ dummy_cleanable_.get());
+ if (!s.ok()) {
+ return s;
+ }
+ if (!ret) {
break;
}
}
--- /dev/null
+* Fixed `kBlockCacheTier` reads to return `Status::Incomplete` when I/O is needed to fetch a merge chain's base value from a blob file.