Targeted optimizations for BlockSTMv2 (#17336)

gelash · web-flow · commit 1d1654c0adbc · 2025-09-10T14:41:35.000Z
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/Cargo.toml b/Cargo.toml
@@ -728,6 +728,7 @@ quote = "1.0.18"
 rand = "0.7.3"
 rand_core = "0.5.1"
 random_word = "0.3.0"
+rapidhash = "1.4.0"
 rayon = "1.5.2"
 redis = { version = "0.22.3", features = [
     "tokio-comp",
diff --git a/aptos-move/block-executor/src/executor.rs b/aptos-move/block-executor/src/executor.rs
@@ -137,24 +137,26 @@ where
         }
     }
 
+    // The bool in the result indicates whether the execution result is a speculative abort.
     fn process_execution_result<'a>(
         execution_result: &'a ExecutionStatus<E::Output, E::Error>,
         read_set: &mut CapturedReads<T, ModuleId, CompiledModule, Module, AptosModuleExtension>,
         txn_idx: TxnIndex,
-    ) -> Result<Option<&'a E::Output>, PanicError> {
+    ) -> Result<(Option<&'a E::Output>, bool), PanicError> {
         match execution_result {
             ExecutionStatus::Success(output) | ExecutionStatus::SkipRest(output) => {
-                Ok(Some(output))
+                Ok((Some(output), false))
             },
             ExecutionStatus::SpeculativeExecutionAbortError(_msg) => {
                 // TODO(BlockSTMv2): cleaner to rename or distinguish V2 early abort
-                // from DeltaApplicationFailure.
+                // from DeltaApplicationFailure. This is also why we return the bool
+                // separately for now instead of relying on the read set.
                 read_set.capture_delayed_field_read_error(&PanicOr::Or(
                     MVDelayedFieldsError::DeltaApplicationFailure,
                 ));
-                Ok(None)
+                Ok((None, true))
             },
-            ExecutionStatus::Abort(_err) => Ok(None),
+            ExecutionStatus::Abort(_err) => Ok((None, false)),
             ExecutionStatus::DelayedFieldsCodeInvariantError(msg) => {
                 Err(code_invariant_error(format!(
                     "[Execution] At txn {}, failed with DelayedFieldsCodeInvariantError: {:?}",
@@ -365,9 +367,19 @@ where
             )));
         }
 
-        let maybe_output =
+        let (maybe_output, is_speculative_failure) =
             Self::process_execution_result(&execution_result, &mut read_set, idx_to_execute)?;
 
+        if is_speculative_failure {
+            // Recording in order to check the invariant that the final, committed incarnation
+            // of each transaction is not a speculative failure.
+            last_input_output.record_speculative_failure(idx_to_execute);
+            // Ignoring module validation requirements since a speculative failure
+            // requires re-execution anyway.
+            let _ = scheduler.finish_execution(abort_manager)?;
+            return Ok(());
+        }
+
         Self::process_delayed_field_output(
             maybe_output,
             idx_to_execute,
@@ -510,7 +522,7 @@ where
                 idx_to_execute, incarnation
             )));
         }
-        let processed_output =
+        let (processed_output, _) =
             Self::process_execution_result(&execution_result, &mut read_set, idx_to_execute)?;
 
         let mut prev_modified_resource_keys = last_input_output
@@ -666,29 +678,31 @@ where
         // 2. The only possible time to take the read-set from txn_last_input_output
         // is in prepare_and_queue_commit_ready_txn (applying module publishing output).
         // However, required module validation necessarily occurs before the commit.
-        let read_set = last_input_output.read_set(idx_to_validate).ok_or_else(|| {
-            code_invariant_error(format!(
-                "Prior read-set of txn {} incarnation {} not recorded for module verification",
-                idx_to_validate, incarnation_to_validate
-            ))
-        })?;
+        let (read_set, is_speculative_failure) =
+            last_input_output.read_set(idx_to_validate).ok_or_else(|| {
+                code_invariant_error(format!(
+                    "Prior read-set of txn {} incarnation {} not recorded for module verification",
+                    idx_to_validate, incarnation_to_validate
+                ))
+            })?;
         // Perform invariant checks or return early based on read set's incarnation.
         let blockstm_v2_incarnation = read_set.blockstm_v2_incarnation().ok_or_else(|| {
             code_invariant_error(
                 "BlockSTMv2 must be enabled in CapturedReads when validating module reads",
             )
         })?;
+        if blockstm_v2_incarnation > incarnation_to_validate || is_speculative_failure {
+            // No need to validate as a newer incarnation has already been executed
+            // and recorded its output, or the incarnation has resulted in a speculative
+            // failure, which means there will be a further re-execution.
+            return Ok(true);
+        }
         if blockstm_v2_incarnation < incarnation_to_validate {
             return Err(code_invariant_error(format!(
                 "For txn_idx {}, read set incarnation {} < incarnation to validate {}",
                 idx_to_validate, blockstm_v2_incarnation, incarnation_to_validate
             )));
         }
-        if blockstm_v2_incarnation > incarnation_to_validate {
-            // No need to validate as a newer incarnation has already been executed
-            // and recorded its output.
-            return Ok(true);
-        }
 
         if !read_set.validate_module_reads(
             global_module_cache,
@@ -715,10 +729,14 @@ where
         skip_module_reads_validation: bool,
     ) -> bool {
         let _timer = TASK_VALIDATE_SECONDS.start_timer();
-        let read_set = last_input_output
+        let (read_set, is_speculative_failure) = last_input_output
             .read_set(idx_to_validate)
             .expect("[BlockSTM]: Prior read-set must be recorded");
 
+        if is_speculative_failure {
+            return false;
+        }
+
         assert!(
             !read_set.is_incorrect_use(),
             "Incorrect use must be handled after execution"
@@ -775,10 +793,14 @@ where
         last_input_output: &TxnLastInputOutput<T, E::Output, E::Error>,
         is_v2: bool,
     ) -> Result<bool, PanicError> {
-        let read_set = last_input_output
+        let (read_set, is_speculative_failure) = last_input_output
             .read_set(txn_idx)
             .ok_or_else(|| code_invariant_error("Read set must be recorded"))?;
 
+        if is_speculative_failure {
+            return Ok(false);
+        }
+
         if !read_set.validate_delayed_field_reads(versioned_cache.delayed_fields(), txn_idx)?
             || (is_v2
                 && !read_set.validate_aggregator_v1_reads(
diff --git a/aptos-move/block-executor/src/txn_last_input_output.rs b/aptos-move/block-executor/src/txn_last_input_output.rs
@@ -31,7 +31,10 @@ use std::{
     collections::{BTreeSet, HashSet},
     fmt::Debug,
     iter::{empty, Iterator},
-    sync::Arc,
+    sync::{
+        atomic::{AtomicBool, Ordering},
+        Arc,
+    },
 };
 
 type TxnInput<T> = CapturedReads<T, ModuleId, CompiledModule, Module, AptosModuleExtension>;
@@ -66,6 +69,9 @@ pub struct TxnLastInputOutput<T: Transaction, O: TransactionOutput<Txn = T>, E:
 
     // TODO: Consider breaking down the outputs when storing (avoid traversals, cache below).
     outputs: Vec<CachePadded<ArcSwapOption<ExecutionStatus<O, E>>>>, // txn_idx -> output.
+    // Used to record if the latest incarnation of a txn was a failure due to the
+    // speculative nature of parallel execution.
+    speculative_failures: Vec<CachePadded<AtomicBool>>,
 }
 
 impl<T: Transaction, O: TransactionOutput<Txn = T>, E: Debug + Send + Clone>
@@ -81,6 +87,9 @@ impl<T: Transaction, O: TransactionOutput<Txn = T>, E: Debug + Send + Clone>
             outputs: (0..num_txns)
                 .map(|_| CachePadded::new(ArcSwapOption::empty()))
                 .collect(),
+            speculative_failures: (0..num_txns)
+                .map(|_| CachePadded::new(AtomicBool::new(false)))
+                .collect(),
         }
     }
 
@@ -90,10 +99,15 @@ impl<T: Transaction, O: TransactionOutput<Txn = T>, E: Debug + Send + Clone>
         input: TxnInput<T>,
         output: ExecutionStatus<O, E>,
     ) {
+        self.speculative_failures[txn_idx as usize].store(false, Ordering::Relaxed);
         self.inputs[txn_idx as usize].store(Some(Arc::new(input)));
         self.outputs[txn_idx as usize].store(Some(Arc::new(output)));
     }
 
+    pub(crate) fn record_speculative_failure(&self, txn_idx: TxnIndex) {
+        self.speculative_failures[txn_idx as usize].store(true, Ordering::Relaxed);
+    }
+
     pub fn fetch_exchanged_data(
         &self,
         key: &T::Key,
@@ -119,8 +133,13 @@ impl<T: Transaction, O: TransactionOutput<Txn = T>, E: Debug + Send + Clone>
         )
     }
 
-    pub(crate) fn read_set(&self, txn_idx: TxnIndex) -> Option<Arc<TxnInput<T>>> {
-        self.inputs[txn_idx as usize].load_full()
+    // Returns the latest recorded read set of the txn (None if nothing is recorded) together
+    // with a flag indicating whether its latest incarnation ended in a speculative failure.
+    pub(crate) fn read_set(&self, txn_idx: TxnIndex) -> Option<(Arc<TxnInput<T>>, bool)> {
+        let input = self.inputs[txn_idx as usize].load_full()?;
+        let speculative_failure =
+            self.speculative_failures[txn_idx as usize].load(Ordering::Relaxed);
+        Some((input, speculative_failure))
     }
 
     // Should be called when txn_idx is committed, while holding commit lock.
@@ -539,7 +558,7 @@ impl<T: Transaction, O: TransactionOutput<Txn = T>, E: Debug + Send + Clone>
     }
 
     pub(crate) fn get_txn_read_write_summary(&self, txn_idx: TxnIndex) -> ReadWriteSummary<T> {
-        let read_set = self.read_set(txn_idx).expect("Read set must be recorded");
+        let read_set = self.read_set(txn_idx).expect("Read set must be recorded").0;
 
         let reads = read_set.get_read_summary();
         let writes = self.get_write_summary(txn_idx);
diff --git a/types/Cargo.toml b/types/Cargo.toml
@@ -57,6 +57,7 @@ proptest = { workspace = true, optional = true }
 proptest-derive = { workspace = true, optional = true }
 quick_cache = { workspace = true }
 rand = { workspace = true }
+rapidhash = { workspace = true }
 rayon = { workspace = true }
 ref-cast = { workspace = true }
 ring = { workspace = true }
diff --git a/types/src/state_store/state_value.rs b/types/src/state_store/state_value.rs
@@ -10,6 +10,7 @@ use aptos_crypto_derive::{BCSCryptoHash, CryptoHasher};
 use bytes::Bytes;
 #[cfg(any(test, feature = "fuzzing"))]
 use proptest::{arbitrary::Arbitrary, collection::vec, prelude::*};
+use rapidhash::rapidhash;
 use serde::{Deserialize, Deserializer, Serialize, Serializer};
 
 #[derive(Deserialize, Serialize)]
@@ -178,14 +179,34 @@ impl PersistedStateValue {
     }
 }
 
-#[derive(BCSCryptoHash, Clone, CryptoHasher, Debug, Eq, PartialEq)]
+#[derive(Clone, Debug, BCSCryptoHash, CryptoHasher)]
 pub struct StateValue {
     data: Bytes,
     metadata: StateValueMetadata,
+    maybe_rapid_hash: Option<(u64, usize)>,
 }
 
+impl PartialEq for StateValue {
+    fn eq(&self, other: &Self) -> bool {
+        // Fast path: if both hold a (hash, length) pair and the pairs differ, the data differs.
+        if let (Some(hash1), Some(hash2)) = (&self.maybe_rapid_hash, &other.maybe_rapid_hash) {
+            if hash1 != hash2 {
+                return false;
+            }
+        }
+
+        // Otherwise compare fully: equal hashes may still be a collision, or a hash is absent.
+        self.data == other.data && self.metadata == other.metadata
+    }
+}
+
+impl Eq for StateValue {}
+
 pub const ARB_STATE_VALUE_MAX_SIZE: usize = 100;
 
+/// Minimum data length in bytes at which StateValue caches a rapid hash for equality checks
+pub const RAPID_HASH_THRESHOLD: usize = 32;
+
 #[cfg(any(test, feature = "fuzzing"))]
 impl Arbitrary for StateValue {
     type Parameters = ();
@@ -217,8 +238,17 @@ impl Serialize for StateValue {
 }
 
 impl StateValue {
+    /// Returns (rapid hash, length) of data when len >= RAPID_HASH_THRESHOLD, otherwise None
+    fn compute_rapid_hash(data: &Bytes) -> Option<(u64, usize)> {
+        (data.len() >= RAPID_HASH_THRESHOLD).then(|| (rapidhash(data), data.len()))
+    }
+
     fn to_persistable_form(&self) -> PersistedStateValue {
-        let Self { data, metadata } = self.clone();
+        let Self {
+            data,
+            metadata,
+            maybe_rapid_hash: _,
+        } = self.clone();
         let metadata = metadata.into_persistable();
         match metadata {
             None => PersistedStateValue::V0(data),
@@ -231,7 +261,12 @@ impl StateValue {
     }
 
     pub fn new_with_metadata(data: Bytes, metadata: StateValueMetadata) -> Self {
-        Self { data, metadata }
+        let maybe_rapid_hash = Self::compute_rapid_hash(&data);
+        Self {
+            data,
+            metadata,
+            maybe_rapid_hash,
+        }
     }
 
     pub fn size(&self) -> usize {
@@ -249,6 +284,7 @@ impl StateValue {
         f: F,
     ) -> anyhow::Result<StateValue> {
         self.data = f(self.data)?;
+        self.maybe_rapid_hash = Self::compute_rapid_hash(&self.data);
         Ok(self)
     }
 
@@ -258,6 +294,7 @@ impl StateValue {
 
     pub fn set_bytes(&mut self, data: Bytes) {
         self.data = data;
+        self.maybe_rapid_hash = Self::compute_rapid_hash(&self.data);
     }
 
     pub fn metadata(&self) -> &StateValueMetadata {
@@ -273,7 +310,11 @@ impl StateValue {
     }
 
     pub fn unpack(self) -> (StateValueMetadata, Bytes) {
-        let Self { data, metadata } = self;
+        let Self {
+            data,
+            metadata,
+            maybe_rapid_hash: _,
+        } = self;
 
         (metadata, data)
     }