From 59b6890ff0bb0fcd3e255514dd0127b1e7f136d8 Mon Sep 17 00:00:00 2001 From: Josh Durgin Date: Fri, 31 Jul 2020 18:18:25 -0400 Subject: [PATCH] osd/PGLog: add some documentation Signed-off-by: Josh Durgin --- src/osd/PGLog.h | 74 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/src/osd/PGLog.h b/src/osd/PGLog.h index 9d3a3be6ac838..89da871e9b891 100644 --- a/src/osd/PGLog.h +++ b/src/osd/PGLog.h @@ -29,6 +29,80 @@ #include "crimson/os/cyanstore/cyan_collection.h" #endif +/** @name PG Log + * + * The pg log serves three primary purposes: + * + * 1) improving recovery speed + * + * 2) detecting duplicate ops + * + * 3) making erasure coded updates safe + * + * For (1), the main data type is pg_log_entry_t. This is indexed in + * memory by the IndexedLog class - this is where most of the logic + * surrounding pg log is kept, even though the low level types are in + * src/osd/osd_types.h. + * + * (2) uses a type which is a subset of the full log entry, containing + * just the pieces we need to identify and respond to a duplicate + * request. + * + * As we trim the log, we convert pg_log_entry_t to smaller + * pg_log_dup_t, and finally remove them once we reach a higher + * limit. This is controlled by a few options: + * + * osd_min_pg_log_entries, osd_max_pg_log_entries, and + * osd_pg_log_dups_tracked + * + * For example, with a min of 100, max of 1000, and dups tracked of + * 3000, the log entries and dups stored would span the following + * versions, assuming the current earliest is version 1: + * + * version: 3000 ......... 2001 | 2000 ......... 1 + *          [ pg log entries  ] [ pg log dups   ] + * + * after osd_pg_log_trim_min subsequent writes to this PG, the log + * would be trimmed to look like: + * + * version: 3100 ......... 2101 | 2100 ......... 101 + *          [ pg log entries  ] [ pg log dups    ] + * + * (3) means tracking the previous state of an object, so that we can + * roll back to that prior state if necessary. It's only used for + * erasure coding. 
Consider an erasure code of 4+2, for example. + * + * This means we split the object into 4 pieces (called shards) and + * compute 2 parity shards. Each of these shards is stored on a + * separate OSD. As long as 4 shards are the same version, we can + * recover the remaining 2 by computation. Imagine that during a write, 3 + * of the OSDs go down and restart, resulting in shards 0,1,2 + * reflecting version A and shards 3,4,5 reflecting version B, after + * the write. + * + * If we had no way to reconstruct version A for another shard, we + * would have lost the object. + * + * The actual data for rollback is stored in a look-aside object and + * is removed once the EC write commits on all shards. The pg log just + * stores the versions so we can tell how far we can roll back, and a + * description of the type of operation for each log entry. Beyond + * the pg log, see PGBackend::Trimmer and PGBackend::RollbackVisitor + * for more details on this. + * + * An important implication of this is that although the pg log length + * is normally bounded, under extreme conditions, with many EC I/Os + * outstanding, the log may grow beyond that point because we need to + * keep the rollback information for all outstanding EC I/O. + * + * For more on pg log bounds, see where it is calculated in + * PeeringState::calc_trim_to_aggressive(). + * + * For more details on how peering uses the pg log, and architectural + * reasons for its existence, see: + * + * doc/dev/osd_internals/log_based_pg.rst + * + */ + constexpr auto PGLOG_INDEXED_OBJECTS = 1 << 0; constexpr auto PGLOG_INDEXED_CALLER_OPS = 1 << 1; constexpr auto PGLOG_INDEXED_EXTRA_CALLER_OPS = 1 << 2; -- 2.39.5