From 59b6890ff0bb0fcd3e255514dd0127b1e7f136d8 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 31 Jul 2020 18:18:25 -0400 Subject: [PATCH] osd/PGLog: add some documentation Signed-off-by: Josh Durgin --- src/osd/PGLog.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 9d3a3be6ac838..89da871e9b891 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -29,6 +29,80 @@ #include "crimson/os/cyanstore/cyan_collection.h" #endif +/** @name PG Log + * + * The pg log serves three primary purposes: + * + * 1) improving recovery speed + * + * 2) detecting duplicate ops + * + * 3) making erasure coded updates safe + * + * For (1), the main data type is pg_log_entry_t. This is indexed in + * memory by the IndexedLog class - this is where most of the logic + * surrounding pg log is kept, even though the low level types are in + * src/osd/osd_types.h. + * + * (2) uses a type which is a subset of the full log entry, containing + * just the pieces we need to identify and respond to a duplicate + * request. + * + * As we trim the log, we convert pg_log_entry_t to smaller + * pg_log_dup_t, and finally remove them once we reach a higher + * limit. This is controlled by a few options: + * + * osd_min_pg_log_entries, osd_max_pg_log_entries, and + * osd_pg_log_dups_tracked + * + * For example, with a min of 100, max of 1000, and dups tracked of + * 3000, the log entries and dups stored would span the following + * versions, assuming the current earliest is version 1: + * + * version: 3000 ......... 2001 | 2000 ......... 1 + *          [ pg log entries  ] [ pg log dups   ] + * + * after osd_pg_log_trim_min subsequent writes to this PG, the log + * would be trimmed to look like: + * + * version: 3100 ......... 2101 | 2100 ......... 101 + *          [ pg log entries  ] [ pg log dups    ] + * + * (3) means tracking the previous state of an object, so that we can + * roll back to that prior state if necessary. It's only used for + * erasure coding. 
Consider an erasure code of 4+2, for example. + * + * This means we split the object into 4 pieces (called shards) and + * compute 2 parity shards. Each of these shards is stored on a + * separate OSD. As long as 4 shards are the same version, we can + * recover the remaining 2 by computation. Imagine that during a write, 3 + * of the OSDs go down and restart, resulting in shards 0,1,2 + * reflecting version A and shards 3,4,5 reflecting version B, after + * the write. + * + * If we had no way to reconstruct version A for another shard, we + * would have lost the object. + * + * The actual data for rollback is stored in a look-aside object and + * is removed once the EC write commits on all shards. The pg log just + * stores the versions so we can tell how far we can roll back, and a + * description of the type of operation for each log entry. Beyond + * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor + * for more details on this. + * + * An important implication of this is that although the pg log length + * is normally bounded, under extreme conditions, with many EC I/Os + * outstanding, the log may grow beyond that point because we need to + * keep the rollback information for all outstanding EC I/O. + * + * For more on pg log bounds, see where it is calculated in + * PeeringState::calc_trim_to_aggressive(). + * + * For more details on how peering uses the pg log, and architectural + * reasons for its existence, see: + * + * doc/dev/osd_internals/log_based_pg.rst + * + */ + constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0; constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1; constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2; -- 2.39.5