diff --git a/broker/src/main/java/org/apache/rocketmq/broker/BrokerController.java b/broker/src/main/java/org/apache/rocketmq/broker/BrokerController.java index 8e2954d8ff0..957d8696fb9 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/BrokerController.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/BrokerController.java @@ -1027,6 +1027,17 @@ private void reloadServerSslContext() { return result; } + /** + * Register pre-put hooks and the send-message-back hook into the message + * store. The hooks are executed in order before every {@code putMessage}: + *
<ol>
+ *   <li>{@code checkBeforePutMessage} — validate topic, body, and queue</li>
+ *   <li>{@code innerBatchChecker} — process inner-batch envelope messages</li>
+ *   <li>{@code handleScheduleMessage} — convert delayed messages into
+ *       timer-queue entries</li>
+ *   <li>{@code handleLmqQuota} — enforce light message queue quota</li>
+ * </ol>
+ */ public void registerMessageStoreHook() { List putMessageHookList = messageStore.getPutMessageHookList(); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/longpolling/PopLongPollingService.java b/broker/src/main/java/org/apache/rocketmq/broker/longpolling/PopLongPollingService.java index c595178d193..134887f9ad4 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/longpolling/PopLongPollingService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/longpolling/PopLongPollingService.java @@ -52,6 +52,24 @@ import static org.apache.rocketmq.broker.longpolling.PollingResult.POLLING_SUC; import static org.apache.rocketmq.broker.longpolling.PollingResult.POLLING_TIMEOUT; +/** + * Pop-mode long polling service that suspends Pop requests and wakes them up when new messages arrive. + *

+ * Core responsibilities: + *

+ */ public class PopLongPollingService extends ServiceThread { private static final Logger POP_LOGGER = diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerCache.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerCache.java index c74c5793a5c..94a69135d81 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerCache.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerCache.java @@ -35,6 +35,41 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * In-memory cache for un-acked Pop consumer records, used when + * {@code enablePopBufferMerge} is enabled in the KVStore path. + * + *

+ * The cache structure is as follows: { + * groupId@topicId@queueId: { + * active: ConcurrentSkipListMap, + * removed: ConcurrentSkipListMap + * } + * } + * active(recordTreeMap): in-flight records + * removed(removedTreeMap): records to be removed + *

+ * + *

Popped messages are stored here by + * {@link PopConsumerService#popAsync}. The background {@link #run()} thread + * periodically scans the cache and processes expired records: + *

+ * + *

Each {@code groupId@topicId@queueId} entry is backed by a + * {@link ConsumerRecords} instance containing two + * {@link ConcurrentSkipListMap}s — one for active records and one for + * records staged for removal. + */ public class PopConsumerCache extends ServiceThread { private static final Logger log = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); @@ -46,6 +81,14 @@ public class PopConsumerCache extends ServiceThread { private final Consumer reviveConsumer; private final AtomicInteger estimateCacheSize; + /** + * Maps {@code consumerGroupId@topicId@queueId} to the buffered records for that + * consumer-queue. + * + *

Used by {@link #writeRecords} to add popped messages, + * {@link #deleteRecords} to remove acked messages, and + * {@link #cleanupRecords} to process expired records. + */ private final ConcurrentMap consumerRecordTable; public PopConsumerCache(BrokerController brokerController, PopConsumerKVStore consumerRecordStore, @@ -89,9 +132,20 @@ public long getPopInFlightMessageCount(String groupId, String topicId, int queue return consumerRecords != null ? consumerRecords.getInFlightRecordCount() : 0L; } + /** + * Write popped records into the cache. + * + *

Each record is inserted into the {@link ConsumerRecords} for its + * {@code groupId@topicId@queueId}. If no entry exists for that key, a + * new one is created. The cache size estimate is incremented. + * + * @param consumerRecordList the popped records to cache + */ public void writeRecords(List consumerRecordList) { this.estimateCacheSize.addAndGet(consumerRecordList.size()); consumerRecordList.forEach(consumerRecord -> { + // consumerRecords is the recordMap in cache + // it contains two maps of PopConsumerRecord ConsumerRecords consumerRecords = ConcurrentHashMapUtils.computeIfAbsent(consumerRecordTable, this.getKey(consumerRecord), k -> new ConsumerRecords(brokerController.getBrokerConfig(), consumerRecord.getGroupId(), consumerRecord.getTopicId(), consumerRecord.getQueueId())); @@ -205,13 +259,46 @@ public void run() { } } + /** + * Records for one {@code consumerGroupId@topicId@queueId} in the Pop cache. + * + *

Uses two {@link ConcurrentSkipListMap}s to separate active and + * expiring records for safe two-phase cleanup: + *

<ol>
+ *   <li>{@link #stageExpiredRecords} moves timed-out records from
+ *       {@link #recordTreeMap} to {@link #removeTreeMap}</li>
+ *   <li>{@link PopConsumerCache#cleanupRecords} drains
+ *       {@link #removeTreeMap} — true-expired records are revived,
+ *       approaching-expired records are written to the KVStore</li>
+ * </ol>
+ */ protected static class ConsumerRecords { private final String groupId; private final String topicId; private final int queueId; private final BrokerConfig brokerConfig; + /** + * Staged records awaiting cleanup (revival or KVStore write). + * + *

Populated by {@link #stageExpiredRecords} and drained by + * {@link PopConsumerCache#cleanupRecords}. Sorted by offset + * so that {@link #getMinOffset} can include these records in + * the minimum offset computation. + */ private final ConcurrentSkipListMap removeTreeMap; + /** + * Active (in-flight) records that have been popped but not yet + * acked by the consumer. + * + *

Records are added via {@link #write} when messages are popped, + * removed via {@link #delete} when an ack arrives, and moved to + * {@link #removeTreeMap} via {@link #stageExpiredRecords} when + * the visibility timeout or stay-buffer time expires. + * + *

Sorted by offset for efficient minimum-offset queries + * ({@link #getMinOffset}). + */ private final ConcurrentSkipListMap recordTreeMap; public ConsumerRecords(BrokerConfig brokerConfig, String groupId, String topicId, int queueId) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerKVStore.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerKVStore.java index 33072d699b5..fc2c9739a14 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerKVStore.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerKVStore.java @@ -18,6 +18,22 @@ import java.util.List; +/** + * Persistent key-value store for un-acked Pop consumer records. + * + *

Used by the KVStore-based ack path ({@code popConsumerKVServiceEnable=true}). + * When a message is popped, a record is written here. When the consumer acks + * the message or the visibility timeout expires, the record is deleted or + * revived. The default implementation is {@code PopConsumerRocksdbStore}. + * + *

This interface supports three operations: + *

+ */ public interface PopConsumerKVStore { /** diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRecord.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRecord.java index d10b584ef69..73c85311614 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRecord.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRecord.java @@ -58,6 +58,15 @@ public int getCode() { @JSONField(ordinal = 4) private int retryFlag; + /** + * Message visibility timeout in milliseconds. + * + *

The visibility timeout ({@code popTime + invisibleTime}) determines when + * a popped-but-unacked message becomes eligible for revival. Set by the + * consumer (default 60s via {@code DefaultMQPushConsumer#setPopInvisibleTime}). + * Can be changed by proxy with config. + * Can be extended via {@code ChangeInvisibleTime}. + */ @JSONField(ordinal = 5) private long invisibleTime; @@ -67,9 +76,31 @@ public int getCode() { @JSONField(ordinal = 7) private int attemptTimes; + /** + * Client-generated idempotency key for FIFO ordered consumption. + * + *

Possible values: + *

+ */ @JSONField(ordinal = 8) private String attemptId; + /** + * Whether the consumer has suspended (nacked) this message. + * + *

When {@code true}, the reconsume count is not incremented on + * revive, so the message will not be prematurely sent to the DLQ due to + * repeated visibility timeout extensions. Set via + * {@code ChangeInvisibleTimeRequestHeader#isSuspend}. + */ @JSONField(ordinal = 9) private boolean suspend; @@ -102,7 +133,19 @@ public long getVisibilityTimeout() { } /** - * Key: timestamp(8) + groupId + topicId + queueId + offset + * Build the RocksDB key for this record. + * + *

Format: + *

+     * visibilityTimeout(8B) + groupId + '@' + topicId + '@' + queueId(4B) + '@' + offset(8B)
+     * 
+ * + *

The {@code visibilityTimeout} is placed first so that records are ordered + * by expiration time in RocksDB's SST files. This allows + * {@code PopConsumerRocksdbStore#scanExpiredRecords} to use a bounded iterator + * to scan only the relevant time window without a full table scan. + * + *

NACK(changeInvisibleTime) will create a new record, and the old one will be deleted. */ @JSONField(serialize = false) public byte[] getKeyBytes() { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRocksdbStore.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRocksdbStore.java index dc68f9d9fe5..fcd5826853e 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRocksdbStore.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerRocksdbStore.java @@ -38,6 +38,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * RocksDB-backed implementation of {@link PopConsumerKVStore} for the + * KVStore-based Pop ack path. + * + *

Stores Pop consumer records in a dedicated {@code "popState"} column + * family. Each record is keyed by {@code visibilityTimeout|groupId@topicId@queueId@offset} + * so that {@link #scanExpiredRecords} can efficiently scan only expired + * records within a time window without a full table scan. + * + *

Write and delete operations use synchronous flush and WAL for + * durability — Pop visibility state is the sole source of truth in the + * KVStore path and must survive crashes. + */ public class PopConsumerRocksdbStore extends AbstractRocksDBStorage implements PopConsumerKVStore { private static final Logger log = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); @@ -55,31 +68,71 @@ public PopConsumerRocksdbStore(String filePath, long blockCacheSize, long writeB this.writeBufferSize = writeBufferSize; } - // https://www.cnblogs.com/renjc/p/rocksdb-class-db.html - // https://github.com/johnzeng/rocksdb-doc-cn/blob/master/doc/RocksDB-Tuning-Guide.md + /** + * Configure RocksDB options for Pop consumer record storage. + * + *

Unlike the parent class defaults, write and delete options enable + * WAL and synchronous flush — Pop visibility state is the sole source + * of truth and must survive crashes. Compaction is configured to be + * aggressive so that expired-then-deleted records are purged promptly, + * reclaiming disk space. + * + * @see <a href="https://www.cnblogs.com/renjc/p/rocksdb-class-db.html">rocksdb-class-db</a> + * @see <a href="https://github.com/johnzeng/rocksdb-doc-cn/blob/master/doc/RocksDB-Tuning-Guide.md">RocksDB-Tuning-Guide</a> + */ protected void initOptions() { + // durability-first: enable WAL and sync flush for pop state recovery this.options = RocksDBOptionsFactory.createDBOptions(); this.writeOptions = new WriteOptions(); + // fsync every write to disk this.writeOptions.setSync(true); + // enable WAL this.writeOptions.setDisableWAL(false); + // allow write throttling under pressure this.writeOptions.setNoSlowdown(false); + // delete must be durable too — otherwise ack can be lost and message revived incorrectly this.deleteOptions = new WriteOptions(); this.deleteOptions.setSync(true); this.deleteOptions.setDisableWAL(false); this.deleteOptions.setNoSlowdown(false); + // aggressive compaction to purge expired pop records and reclaim space this.compactRangeOptions = new CompactRangeOptions(); + // force compact bottom level this.compactRangeOptions.setBottommostLevelCompaction( CompactRangeOptions.BottommostLevelCompaction.kForce); + // allow compaction to pause writes this.compactRangeOptions.setAllowWriteStall(true); + // manual compaction runs in parallel with auto-compaction. 
+ // Appropriate here because expired Pop records generate tombstones continuously, + // and cleanup should not starve RocksDB's normal background work this.compactRangeOptions.setExclusiveManualCompaction(false); + // Allows compaction to move data across levels this.compactRangeOptions.setChangeLevel(true); + // -1 delegates level selection to RocksDB's internal heuristics this.compactRangeOptions.setTargetLevel(-1); + // Splits the compaction work into at most 4 parallel sub-tasks this.compactRangeOptions.setMaxSubcompactions(4); } + /** + * Initialise the RocksDB instance with a dedicated column family for Pop state. + * + *

Two column families are created: + *

<ol>
+ *   <li>{@code default} — unused, required by RocksDB</li>
+ *   <li>{@code "popState"} — stores Pop consumer records keyed by
+ *       {@code visibilityTimeout|groupId@topicId@queueId@offset}</li>
+ * </ol>
+ * + *

Called by {@link AbstractRocksDBStorage#start()} before the storage + * is marked as loaded. Returns {@code false} if any step fails, preventing + * all subsequent read/write operations via {@link #hold()}. + * + * @return {@code true} if the database was opened successfully + */ @Override protected boolean postLoad() { try { @@ -111,6 +164,16 @@ public String getFilePath() { return this.dbPath; } + /** + * Batch-write consumer records to RocksDB via a single {@link WriteBatch}. + * Key: (popTime + invisibleTime) + groupId + topicId + queueId + offset + * value: PopConsumerRecord.toJsonBytes + * + *

Each record is serialized with its visibility-timeout-prefixed key + * so that {@link #scanExpiredRecords} can efficiently scan by time range. + * + * @param consumerRecordList the records to persist + */ @Override public void writeRecords(List consumerRecordList) { if (!consumerRecordList.isEmpty()) { @@ -125,6 +188,14 @@ public void writeRecords(List consumerRecordList) { } } + /** + * Batch-delete consumer records from RocksDB via a single {@link WriteBatch}. + * + *

Deletion uses the same durability guarantees as writes ({@code sync=true}, + * WAL enabled). + * + * @param consumerRecordList the records to remove + */ @Override public void deleteRecords(List consumerRecordList) { if (!consumerRecordList.isEmpty()) { @@ -139,8 +210,19 @@ public void deleteRecords(List consumerRecordList) { } } + /** + * Scan and return expired consumer records within a visibility-timeout range. + * + *

Because each record's key is prefixed with {@code visibilityTimeout}, + * this method uses a RocksDB iterator bounded by {@code [lower, upper)} to + * efficiently scan only the relevant time window without a full table scan. + * + * @param lower inclusive lower bound of the visibility timeout (ms) + * @param upper exclusive upper bound of the visibility timeout (ms) + * @param maxCount maximum number of records to return + * @return up to {@code maxCount} expired records, or an empty list + */ @Override - // https://github.com/facebook/rocksdb/issues/10300 public List scanExpiredRecords(long lower, long upper, int maxCount) { // In RocksDB, we can use SstPartitionerFixedPrefixFactory in cfOptions // and new ColumnFamilyOptions().useFixedLengthPrefixExtractor() to @@ -153,6 +235,7 @@ public List scanExpiredRecords(long lower, long upper, int ma RocksIterator iterator = db.newIterator(this.columnFamilyHandle, scanOptions)) { iterator.seek(ByteBuffer.allocate(Long.BYTES).putLong(lower).array()); while (iterator.isValid() && consumerRecordList.size() < maxCount) { + // decode json bytes to PopConsumerRecord consumerRecordList.add(PopConsumerRecord.decode(iterator.value())); iterator.next(); } diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerService.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerService.java index 9ab5eb651be..a1ed90cc339 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/PopConsumerService.java @@ -109,6 +109,7 @@ public PopConsumerService(BrokerController brokerController) { } /** + * No external callers, only called by unit tests. * In-flight messages are those that have been received from a queue * by a consumer but have not yet been deleted. For standard queues, * there is a limit on the number of in-flight messages, depending on queue traffic and message backlog. 
@@ -119,6 +120,7 @@ public boolean isPopShouldStop(String group, String topic, int queueId) { brokerConfig.getPopInflightMessageThreshold(); } + // No external callers, only called by unit tests. public long getPendingFilterCount(String groupId, String topicId, int queueId) { try { long maxOffset = this.brokerController.getMessageStore().getMaxOffsetInQueue(topicId, queueId); @@ -129,6 +131,7 @@ public long getPendingFilterCount(String groupId, String topicId, int queueId) { } } + // No external callers, only called by unit tests. public GetMessageResult recodeRetryMessage(GetMessageResult getMessageResult, String topicId, long offset, long popTime, long invisibleTime) { @@ -168,6 +171,34 @@ public GetMessageResult recodeRetryMessage(GetMessageResult getMessageResult, return result; } + /** + * Merge a GetMessageResult into the pop context and commit the consumer offset. + * No external callers, only called by unit tests. + * + *

If messages were found: + *

+ * + *

The consumer offset is then committed: + *

+ * + * @param context the pop context to update + * @param result the result from the message store + * @param topicId topic name + * @param queueId queue id + * @param retryType whether this is a retry topic V1/V2 + * @param offset the original consume offset used for this fetch + * @return the updated pop context + */ public PopConsumerContext handleGetMessageResult(PopConsumerContext context, GetMessageResult result, String topicId, int queueId, PopConsumerRecord.RetryType retryType, long offset) { @@ -205,6 +236,29 @@ public PopConsumerContext handleGetMessageResult(PopConsumerContext context, Get return context; } + /** + * Retrieve the starting consume offset for a pop request. + * should be private, no external callers. + * + *

For FIFO consumers, the offset is read from the regular consumer offset. + * For non-FIFO consumers, a separate pull offset is used (compatibility with + * pull consumer switchover). + * + *

If no offset is stored (first pop), it is initialized via + * {@code PopMessageProcessor#getInitOffset} based on {@code initMode} + * (beginning or end of the queue). + * + *

If a reset offset exists (offset reset command issued), the cache is + * cleared, the FIFO lock is released, and the reset offset takes effect + * immediately. + * + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id + * @param initMode consume init mode (min/max) + * @param fifo whether this is a FIFO ordered consumption + * @return the consume offset to start popping from + */ public long getPopOffset(String groupId, String topicId, int queueId, int initMode, boolean fifo) { // For FIFO messages, the pull offset is not used. @@ -213,6 +267,7 @@ public long getPopOffset(String groupId, String topicId, int queueId, int initMo this.brokerController.getConsumerOffsetManager().queryOffset(groupId, topicId, queueId) : this.brokerController.getConsumerOffsetManager().queryPullOffset(groupId, topicId, queueId); + // init offset if (offset < 0L) { try { offset = this.brokerController.getPopMessageProcessor() @@ -223,6 +278,8 @@ public long getPopOffset(String groupId, String topicId, int queueId, int initMo throw new RuntimeException(e); } } + + // get reset offset Long resetOffset = this.brokerController.getConsumerOffsetManager().queryThenEraseResetOffset(topicId, groupId, queueId); if (resetOffset != null) { @@ -231,9 +288,29 @@ public long getPopOffset(String groupId, String topicId, int queueId, int initMo this.brokerController.getConsumerOffsetManager() .commitOffset("ResetPopOffset", groupId, topicId, queueId, resetOffset); } + return resetOffset != null ? resetOffset : offset; } + /** + * Fetch messages from the store with automatic offset correction. + * No external callers, except unit tests. + * + *

If the stored offset is behind the actual consume queue offset + * ({@code OFFSET_TOO_SMALL}, {@code OFFSET_OVERFLOW_BADLY}, + * {@code OFFSET_FOUND_NULL}), the offset is corrected and a retry is + * issued with the corrected offset. This prevents duplicate messages + * when the Pop buffer offset has not yet been committed. + * + * @param clientHost the client address + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id + * @param offset the consume offset to start from + * @param batchSize max number of messages + * @param filter message filter + * @return a future completing with the fetch result + */ public CompletableFuture getMessageAsync(String clientHost, String groupId, String topicId, int queueId, long offset, int batchSize, MessageFilter filter) { @@ -279,6 +356,7 @@ public CompletableFuture getMessageAsync(String clientHost, /** * Fifo message does not have retry feature in broker + * No external callers, only called by unit tests. */ public void setFifoBlocked(PopConsumerContext context, String groupId, String topicId, int queueId, List queueOffsetList, GetMessageResult getMessageResult) { @@ -287,6 +365,7 @@ public void setFifoBlocked(PopConsumerContext context, context.getPopTime(), context.getInvisibleTime(), queueOffsetList, context.getOrderCountInfoBuilder(), getMessageResult); } + // No external callers, only called by unit tests. public boolean isFifoBlocked(PopConsumerContext context, String groupId, String topicId, int queueId) { // If server-side reset offset is enabled, and there is a reset offset, // then return false to make sure that the reset offset takes effect. @@ -298,6 +377,32 @@ public boolean isFifoBlocked(PopConsumerContext context, String groupId, String context.getAttemptId(), topicId, groupId, queueId, context.getInvisibleTime()); } + /** + * Fetch messages from a single queue and append them to the pop context. + * No external callers, except unit tests. + * + *

Chained via {@link CompletableFuture#thenCompose} from + * {@link #getMessageFromTopicAsync}. When the batch is already full + * ({@code remain <= 0}), the pending count is added to the context and + * the chain stops. Otherwise, messages are fetched from the store and + * the result is merged into the context via {@link #handleGetMessageResult}. + * + *

Early termination can occur inside this method when: + *

+ * + * @param future the accumulator future carrying the pop context + * @param clientHost the client address + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id + * @param batchSize max number of messages still needed + * @param filter message filter + * @param retryType whether this is a retry topic V1/V2 + * @return a future completing with the pop context updated with results + */ protected CompletableFuture getMessageAsync(CompletableFuture future, String clientHost, String groupId, String topicId, int queueId, int batchSize, MessageFilter filter, PopConsumerRecord.RetryType retryType) { @@ -335,13 +440,38 @@ protected CompletableFuture getMessageAsync(CompletableFutur }); } + /** + * Fetch messages from every read queue of a topic via a CompletableFuture chain. + * + *

Each queue is visited once. For each queue the + * {@link #getMessageAsync(CompletableFuture, String, String, String, int, int, MessageFilter, PopConsumerRecord.RetryType)} + * method is chained via {@link CompletableFuture#thenCompose}. The chain carries + * the accumulated result through all queues, stopping early when the batch is + * filled, the queue is blocked, or the inflight threshold is reached. + * + *

Queue iteration order respects {@code priorityOrderAsc} and uses + * {@code requestCount} as a round-robin offset for load balancing. + * + * @param future the accumulator future + * @param clientHost the client address + * @param groupId consumer group id + * @param topicId topic name + * @param requestCount round-robin counter for queue selection + * @param batchSize max number of messages to return + * @param filter message filter expression + * @param retryType whether this is a retry topic V1/V2 + * @return a future completing with the pop result context + */ protected CompletableFuture getMessageFromTopicAsync(CompletableFuture future, String clientHost, String groupId, String topicId, long requestCount, int batchSize, MessageFilter filter, PopConsumerRecord.RetryType retryType) { + // get topic config TopicConfig topicConfig = this.brokerController.getTopicConfigManager().selectTopicConfig(topicId); if (null == topicConfig) { return future; } + + // iterate all queues of the topic for (int i = 0; i < topicConfig.getReadQueueNums(); i++) { long index = (brokerController.getBrokerConfig().isPriorityOrderAsc() ? topicConfig.getReadQueueNums() - 1 - i : i) + requestCount; @@ -352,10 +482,38 @@ protected CompletableFuture getMessageFromTopicAsync(Complet return future; } + /** + * Asynchronously pop messages for the KVStore-based ack path. + * + *

This method coordinates the full Pop lifecycle: + *

<ol>
+ *   <li>Validates topic, group, and acquires the consumer lock</li>
+ *   <li>Determines whether to pull from retry topic first
+ *       (based on {@code popFromRetryProbability})</li>
+ *   <li>Pulls messages from normal topic (and retry topic V1/V2 if configured)</li>
+ *   <li>Writes checkpoints to {@link PopConsumerCache} (buffer merge) or
+ *       {@link PopConsumerKVStore} (RocksDB)</li>
+ *   <li>Re-encodes retry messages if needed</li>
+ * </ol>
+ * + * @param clientHost the client address + * @param popTime the pop invocation timestamp + * @param invisibleTime the message visibility timeout + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id (-1 for all queues) + * @param batchSize max number of messages to return + * @param fifo whether this is a FIFO ordered consumption + * @param attemptId attempt id for idempotent consumption + * @param initMode consume init mode (min/max) + * @param filter message filter expression + * @return a future that completes with the pop result context + */ public CompletableFuture popAsync(String clientHost, long popTime, long invisibleTime, String groupId, String topicId, int queueId, int batchSize, boolean fifo, String attemptId, int initMode, MessageFilter filter) { + // init context params PopConsumerContext popConsumerContext = new PopConsumerContext(clientHost, popTime, invisibleTime, groupId, fifo, initMode, attemptId); @@ -391,18 +549,22 @@ public CompletableFuture popAsync(String clientHost, long po CompletableFuture.completedFuture(popConsumerContext); try { + // get message from retry topic, if (!fifo && preferRetry) { + // default config of retrieveMessageFromPopRetryTopicV1 is true, if (brokerConfig.isRetrieveMessageFromPopRetryTopicV1()) { getMessageFuture = this.getMessageFromTopicAsync(getMessageFuture, clientHost, groupId, retryTopicV1, requestCount, batchSize, filter, PopConsumerRecord.RetryType.RETRY_TOPIC_V1); } + // default config of enableRetryTopicV2 is false if (brokerConfig.isEnableRetryTopicV2()) { getMessageFuture = this.getMessageFromTopicAsync(getMessageFuture, clientHost, groupId, retryTopicV2, requestCount, batchSize, filter, PopConsumerRecord.RetryType.RETRY_TOPIC_V2); } } + // get message from normal topic if (queueId != -1) { getMessageFuture = this.getMessageAsync(getMessageFuture, clientHost, groupId, topicId, queueId, batchSize, filter, PopConsumerRecord.RetryType.NORMAL_TOPIC); @@ -410,6 +572,7 
@@ public CompletableFuture popAsync(String clientHost, long po getMessageFuture = this.getMessageFromTopicAsync(getMessageFuture, clientHost, groupId, topicId, requestCount, batchSize, filter, PopConsumerRecord.RetryType.NORMAL_TOPIC); + // get message from retry topic if (!fifo && !preferRetry) { if (brokerConfig.isRetrieveMessageFromPopRetryTopicV1()) { getMessageFuture = this.getMessageFromTopicAsync(getMessageFuture, clientHost, groupId, @@ -425,6 +588,8 @@ public CompletableFuture popAsync(String clientHost, long po return getMessageFuture.thenCompose(result -> { if (result.isFound() && !result.isFifo()) { + // write checkpoint to cache or store + // default config of enablePopBufferMerge is false if (brokerConfig.isEnablePopBufferMerge() && popConsumerCache != null && !popConsumerCache.isCacheFull()) { this.popConsumerCache.writeRecords(result.getPopConsumerRecordList()); @@ -432,6 +597,7 @@ public CompletableFuture popAsync(String clientHost, long po this.popConsumerStore.writeRecords(result.getPopConsumerRecordList()); } + // format result for (int i = 0; i < result.getGetMessageResultList().size(); i++) { GetMessageResult getMessageResult = result.getGetMessageResultList().get(i); PopConsumerRecord popConsumerRecord = result.getPopConsumerRecordList().get(i); @@ -449,6 +615,7 @@ public CompletableFuture popAsync(String clientHost, long po } return CompletableFuture.completedFuture(result); }).whenComplete((result, throwable) -> { + // unlock by consumerLockService try { if (throwable != null) { log.error("PopConsumerService popAsync get message error", @@ -470,7 +637,28 @@ public CompletableFuture popAsync(String clientHost, long po return getMessageFuture; } - // Notify polling request when receive orderly ack + /** + * Delete the acked record from the cache and/or RocksDB store. + * + *

The deletion is a two-step fallback: + *

    + *
  • First, the record is deleted from {@link PopConsumerCache} (if buffer + * merge is enabled). If the record was present in the cache and removed + * successfully, the operation returns immediately without touching RocksDB
  • + *
  • If the cache is not enabled or the record was not found in the cache, + * deletion falls through to {@link PopConsumerKVStore#deleteRecords}
  • + *
+ * + *

Note: the polling request is notified when an orderly ack is received. + * + * @param popTime the original pop time of the message + * @param invisibleTime the original visibility timeout + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id + * @param offset the acked offset + * @return a future that completes with {@code true} on success + */ public CompletableFuture ackAsync( long popTime, long invisibleTime, String groupId, String topicId, int queueId, long offset) { @@ -492,7 +680,30 @@ public CompletableFuture ackAsync( return CompletableFuture.completedFuture(true); } - // refer ChangeInvisibleTimeProcessor.appendCheckPointThenAckOrigin + /** + * Extend the visibility timeout of a popped message (KVStore path). + * + *

See {@code ChangeInvisibleTimeProcessor#appendCheckPointThenAckOrigin}; + * this method is the KVStore equivalent of it. + * + *

A new record with the updated timeout is written to the KVStore, and the + * old record (identified by the original {@code popTime + invisibleTime}) is + * deleted from the cache and KVStore. + * + *

If the new and old records have the same visibility timeout (e.g. the + * consumer extended by the same duration it already had), the delete one is + * skipped because the write one already overwrites the old record in RocksDB. + * + * @param popTime the original pop time + * @param invisibleTime the original visibility timeout + * @param changedPopTime the new pop time (typically current time) + * @param changedInvisibleTime the new visibility timeout + * @param groupId consumer group id + * @param topicId topic name + * @param queueId queue id + * @param offset the message offset + * @param suspend whether to suspend (nack without incrementing reconsume count) + */ public void changeInvisibilityDuration(long popTime, long invisibleTime, long changedPopTime, long changedInvisibleTime, String groupId, String topicId, int queueId, long offset, boolean suspend) { @@ -511,6 +722,7 @@ public void changeInvisibilityDuration(long popTime, long invisibleTime, long ch // No need to generate new records when the group does not exist, // because these retry messages will not be consumed by anyone. + // default value of popReviveSkipIfGroupAbsent is true boolean skipWrite = brokerConfig.isPopReviveSkipIfGroupAbsent() && !brokerController.getSubscriptionGroupManager().containsSubscriptionGroup(groupId); @@ -528,19 +740,41 @@ public void changeInvisibilityDuration(long popTime, long invisibleTime, long ch } // If the new CK has the same key as the old CK (same visibilityTimeout), - // the write already overwrites the old record in RocksDB, skip delete + // the write one already overwrites the old record in RocksDB, skip delete // to avoid removing the newly written record. if (skipWrite || ckRecord.getVisibilityTimeout() != ackRecord.getVisibilityTimeout()) { this.popConsumerStore.deleteRecords(Collections.singletonList(ackRecord)); } } + /** + * Read the original message from storage for revival. + * No external callers, except unit tests. + * + *

Used by {@link #revive(PopConsumerRecord)} when a visibility timeout + * expires. Delegates to {@link org.apache.rocketmq.broker.EscapeBridge} + * which can read from either the local store or a remote broker's store. + * + * @param consumerRecord the expired record + * @return a triple of (message, info, needRetry) + */ // Use broker escape bridge to support remote read public CompletableFuture> getMessageAsync(PopConsumerRecord consumerRecord) { return this.brokerController.getEscapeBridge().getMessageAsync(consumerRecord.getTopicId(), consumerRecord.getOffset(), consumerRecord.getQueueId(), brokerConfig.getBrokerName(), false); } + /** + * Revive a single expired record by re-publishing it to the retry topic. + * No external callers, only called by unit tests. + * + *

Skips the record if the consumer group no longer exists. + * Otherwise, reads the original message, + * and re-publishes it via {@link #reviveRetry}. + * + * @param record the expired record to revive + * @return a future completing with {@code true} on success + */ public CompletableFuture revive(PopConsumerRecord record) { if (brokerConfig.isPopReviveSkipIfGroupAbsent() && @@ -560,21 +794,48 @@ public CompletableFuture revive(PopConsumerRecord record) { log.info("PopConsumerService revive no need retry, record={}", record); return CompletableFuture.completedFuture(!result.getRight()); } + return CompletableFuture.completedFuture(this.reviveRetry(record, result.getLeft())); }); } + // No external callers, only called by unit tests. public void clearCache(String groupId, String topicId, int queueId) { if (popConsumerCache != null) { popConsumerCache.removeRecords(groupId, topicId, queueId); } } + /** + * Scan the KVStore for expired records and revive them. + * Outside this class it is only called by unit tests. + * + *

This is the core revival loop called by {@link #run()}: + *

    + *
  1. Scans {@link PopConsumerKVStore#scanExpiredRecords} for records + * whose visibility timeout falls within {@code [currentTime-3s, now-50ms)}
  2. + *
  3. For each expired record, calls {@link #revive(PopConsumerRecord)} to + * read the original message and re-publish it to the retry topic. + * Concurrency is controlled by a semaphore
  4. + *
  5. Failed revive attempts are retried with exponential backoff via a + * new record with increased {@code invisibleTime} and + * {@code attemptTimes}
  6. + *
  7. After the maximum retry attempts, the record is dropped
  8. + *
+ * + * @param currentTime tracks the last scanned visibility timeout (for incremental progress) + * @param maxCount maximum number of records to process per batch(load from config: 16 * 1024) + * @return the number of consumed (revived) records + */ public long revive(AtomicLong currentTime, int maxCount) { Stopwatch stopwatch = Stopwatch.createStarted(); long upperTime = System.currentTimeMillis() - 50L; + + // scan expired records between [currentTime-3s, now-50ms)] List consumerRecords = this.popConsumerStore.scanExpiredRecords( currentTime.get() - TimeUnit.SECONDS.toMillis(3), upperTime, maxCount); + + // init context params long scanCostTime = stopwatch.elapsed(TimeUnit.MILLISECONDS); // When reading messages from local storage, the current thread is used @@ -588,6 +849,7 @@ public long revive(AtomicLong currentTime, int maxCount) { // could merge read operation here for (PopConsumerRecord record : consumerRecords) { CompletableFuture future; + // revive record try { semaphore.acquire(); future = this.revive(record); @@ -595,6 +857,8 @@ public long revive(AtomicLong currentTime, int maxCount) { semaphore.release(); throw new RuntimeException(e); } + + // add future result to futureList futureList.add(future.thenAccept(result -> { if (!result) { if (record.getAttemptTimes() < brokerConfig.getPopReviveMaxAttemptTimes()) { @@ -614,9 +878,14 @@ public long revive(AtomicLong currentTime, int maxCount) { }).whenComplete((result, ex) -> semaphore.release())); } + // wait for all futures to complete CompletableFuture.allOf(futureList.toArray(new CompletableFuture[0])).join(); + + // then restore failure records and delete successful records this.popConsumerStore.writeRecords(new ArrayList<>(failureList)); this.popConsumerStore.deleteRecords(consumerRecords); + + // set currentTime and logging currentTime.set(consumerRecords.isEmpty() ? 
upperTime : consumerRecords.get(consumerRecords.size() - 1).getVisibilityTimeout()); @@ -636,6 +905,7 @@ public long revive(AtomicLong currentTime, int maxCount) { return consumerRecords.size(); } + // No external callers, only called by unit tests. public void createRetryTopicIfNeeded(String groupId, String retryTopic) { TopicConfig topicConfig = brokerController.getTopicConfigManager().selectTopicConfig(retryTopic); if (topicConfig != null && !brokerController.getBrokerConfig().isUseSeparateRetryQueue()) { @@ -668,6 +938,7 @@ public void createRetryTopicIfNeeded(String groupId, String retryTopic) { @SuppressWarnings("DuplicatedCode") // org.apache.rocketmq.broker.processor.PopReviveService#reviveRetry + // No external callers, only called by unit tests. public boolean reviveRetry(PopConsumerRecord record, MessageExt messageExt) { if (brokerConfig.isPopConsumerKVServiceLog()) { @@ -676,11 +947,13 @@ public boolean reviveRetry(PopConsumerRecord record, MessageExt messageExt) { record.getQueueId(), record.getOffset()); } + // create retry topic if needed boolean retry = StringUtils.startsWith(record.getTopicId(), MixAll.RETRY_GROUP_TOPIC_PREFIX); String retryTopic = retry ? record.getTopicId() : KeyBuilder.buildPopRetryTopic( record.getTopicId(), record.getGroupId(), brokerConfig.isEnableRetryTopicV2()); this.createRetryTopicIfNeeded(record.getGroupId(), retryTopic); + // create retry message // deep copy here MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); msgInner.setTopic(retryTopic); @@ -745,6 +1018,7 @@ private int getRetryQueueId(String retryTopic, MessageExt oriMsg) { } // Export kv store record to revive topic + // admin service @SuppressWarnings("ExtractMethodRecommender") public synchronized void transferToFsStore() { Stopwatch stopwatch = Stopwatch.createStarted(); @@ -824,6 +1098,19 @@ public void shutdown() { } } + /** + * Background thread that periodically revives expired Pop records. + * + *

Each iteration: + *

    + *
  1. Calls {@link #revive(AtomicLong, int)} to scan the RocksDB store for + * records whose visibility timeout has elapsed, fetch the original + * message, and re-publish it to the retry topic
  2. + *
  3. Cleans up stale consumer locks every minute
  4. + *
  5. When the number of revived records is below the batch limit, sleeps + * for a short interval to avoid busy-waiting
  6. + *
+ */ @Override public void run() { this.consumerRunning.set(true); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerManager.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerManager.java index 6f496fa13b3..88ac91149b8 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerManager.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerManager.java @@ -84,7 +84,9 @@ protected void updateLockFreeTimestamp(String topic, String group, int queueId, /** * update the message list received + * called after message pop * + * @param attemptId attemptId * @param isRetry is retry topic or not * @param topic topic * @param group group @@ -97,6 +99,7 @@ protected void updateLockFreeTimestamp(String topic, String group, int queueId, public void update(String attemptId, boolean isRetry, String topic, String group, int queueId, long popTime, long invisibleTime, List msgQueueOffsetList, StringBuilder orderInfoBuilder) { + // init orderInfo map String key = buildKey(topic, group); ConcurrentHashMap qs = table.get(key); if (qs == null) { @@ -107,18 +110,20 @@ public void update(String attemptId, boolean isRetry, String topic, String group } } + // create or merge orderInfo OrderInfo orderInfo = qs.get(queueId); - if (orderInfo != null) { + if (orderInfo != null) { // merge order info OrderInfo newOrderInfo = new OrderInfo(attemptId, popTime, invisibleTime, msgQueueOffsetList, System.currentTimeMillis(), 0); newOrderInfo.mergeOffsetConsumedCount(orderInfo.attemptId, orderInfo.offsetList, orderInfo.offsetConsumedCount); orderInfo = newOrderInfo; - } else { + } else { // create order info orderInfo = new OrderInfo(attemptId, popTime, invisibleTime, msgQueueOffsetList, System.currentTimeMillis(), 0); } qs.put(queueId, orderInfo); + // calculate minConsumedTimes and build orderCountInfo Map offsetConsumedCount = orderInfo.offsetConsumedCount; int 
minConsumedTimes = Integer.MAX_VALUE; if (offsetConsumedCount != null) { @@ -151,6 +156,17 @@ public void update(String attemptId, boolean isRetry, String topic, String group update(attemptId, isRetry, topic, group, queueId, popTime, invisibleTime, msgQueueOffsetList, orderInfoBuilder); } + /** + * Check whether a new Pop request on this queue must be blocked due to an + * in-flight ordered-consumption lock. + * called before message pop + * + *

Looks up the {@link OrderInfo} for the given topic-group-queue triple. + * Delegates to {@link OrderInfo#needBlock} which returns {@code true} if + * any message in the current batch is still within its invisible window + * and has not yet been ACKed — meaning another consumer is already + * working on this queue's ordered batch. + */ @Override public boolean checkBlock(String attemptId, String topic, String group, int queueId, long invisibleTime) { String key = buildKey(topic, group); @@ -200,6 +216,7 @@ public void start() { /** * mark message is consumed finished. return the consumer offset + * called after message ack * * @param topic topic * @param group group @@ -400,6 +417,7 @@ public static class OrderInfo { private long popTime; /** * the invisibleTime when pop message + * it was set the first time when message pop */ @JSONField(name = "i") private Long invisibleTime; @@ -411,8 +429,12 @@ public static class OrderInfo { @JSONField(name = "o") private List offsetList; /** - * next visible timestamp for message - * key: message queue offset + * Per-message override of the next visible timestamp (epoch millis). + * Key: message queue offset. Value: timestamp when the message becomes + * visible again, set by {@code ChangeInvisibleTimeProcessor} to delay + * a specific message beyond the batch default {@link #invisibleTime}. + * Used by {@link #needBlock}, {@link #getLockFreeTimestamp}, and + * {@link #getMaxLockFreeTimestamp} when present. */ @JSONField(name = "ot") private Map offsetNextVisibleTime; @@ -527,22 +549,43 @@ public static List buildOffsetList(List queueOffsetList) { return simple; } - @JSONField(serialize = false, deserialize = false) - public boolean needBlock(String attemptId, long currentInvisibleTime) { + /** + * Determine whether a new Pop request must be blocked by this in-flight + * ordered batch. + * + *

Returns {@code true} (must block) only if all of the following hold: + *

    + *
  • The current attemptId is different from the in-flight one (same + * request retrying is not blocked)
  • + *
  • At least one message in the batch is not yet ACKed + * ({@link #isNotAck})
  • + *
  • That message's invisible window has not yet expired — either + * {@code popTime + invisibleTime} or its individually updated + * {@code offsetNextVisibleTime} is still in the future
  • + *
+ */ + @JSONField(serialize = false, deserialize = false) + public boolean needBlock(String attemptId, long currentInvisibleTime) { + // all offsets are not consumed, do not block if (offsetList == null || offsetList.isEmpty()) { return false; } + + // same request, do not block if (this.attemptId != null && this.attemptId.equals(attemptId)) { return false; } + int num = offsetList.size(); int i = 0; if (this.invisibleTime == null || this.invisibleTime <= 0) { this.invisibleTime = currentInvisibleTime; } long currentTime = System.currentTimeMillis(); + for (; i < num; i++) { if (isNotAck(i)) { + // calculate nextVisibleTime long nextVisibleTime = popTime + invisibleTime; if (offsetNextVisibleTime != null) { Long time = offsetNextVisibleTime.get(this.getQueueOffset(i)); @@ -550,10 +593,14 @@ public boolean needBlock(String attemptId, long currentInvisibleTime) { nextVisibleTime = time; } } + + // if offset is not expired, block if (currentTime < nextVisibleTime) { return true; } } + + // if acked, do nothing } return false; } @@ -671,6 +718,7 @@ public boolean isNotAck(int offsetIndex) { @JSONField(serialize = false, deserialize = false) public void mergeOffsetConsumedCount(String preAttemptId, List preOffsetList, Map prevOffsetConsumedCount) { + // validate input Map offsetConsumedCount = new HashMap<>(); if (prevOffsetConsumedCount == null) { prevOffsetConsumedCount = new HashMap<>(); @@ -679,16 +727,20 @@ public void mergeOffsetConsumedCount(String preAttemptId, List preOffsetLi this.offsetConsumedCount = prevOffsetConsumedCount; return; } + + // get pre offset set Set preQueueOffsetSet = new HashSet<>(); for (int i = 0; i < preOffsetList.size(); i++) { preQueueOffsetSet.add(getQueueOffset(preOffsetList, i)); } + + // merge offsetConsumedCount for (int i = 0; i < offsetList.size(); i++) { long queueOffset = this.getQueueOffset(i); - if (preQueueOffsetSet.contains(queueOffset)) { + if (preQueueOffsetSet.contains(queueOffset)) { // offset has be consumed int count 
= 1; Integer preCount = prevOffsetConsumedCount.get(queueOffset); - if (preCount != null) { + if (preCount != null) { // consumeTimes +1 count = preCount + 1; } offsetConsumedCount.put(queueOffset, count); @@ -711,4 +763,4 @@ public String toString() { .toString(); } } -} \ No newline at end of file +} diff --git a/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerOrderInfoLockManager.java b/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerOrderInfoLockManager.java index 7340e4beb55..22901732033 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerOrderInfoLockManager.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/pop/orderly/QueueLevelConsumerOrderInfoLockManager.java @@ -33,6 +33,27 @@ import org.apache.rocketmq.logging.org.slf4j.Logger; import org.apache.rocketmq.logging.org.slf4j.LoggerFactory; +/** + * Schedules lock-release notifications for queue-level ordered Pop + * consumption. + * + *

When a consumer pops a batch of ordered messages from a queue, the + * queue is effectively "locked" until the consumer ACKs or the invisible + * time expires. This class uses a {@link HashedWheelTimer} to fire a + * notification at the predicted lock-free time, so that other consumers + * blocked in long-polling for the same queue can be woken up immediately + * rather than waiting for the polling timeout. + * + *

Two notification paths are supported: + *

    + *
  • {@link org.apache.rocketmq.broker.processor.PopMessageProcessor#notifyLongPollingRequestIfNeed} + * for regular topics
  • + *
  • {@code LiteEventDispatcher.dispatch} for Lite topics
  • + *
+ * + *

Functionally gated by + * {@code BrokerConfig#isEnableNotifyAfterPopOrderLockRelease}. + */ public class QueueLevelConsumerOrderInfoLockManager { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/AckMessageProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/AckMessageProcessor.java index 65f5f79aec4..3bf35343119 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/AckMessageProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/AckMessageProcessor.java @@ -26,6 +26,8 @@ import org.apache.rocketmq.broker.lite.LiteMetadataUtil; import org.apache.rocketmq.broker.offset.ConsumerOffsetManager; import org.apache.rocketmq.broker.pop.PopConsumerLockService; +import org.apache.rocketmq.broker.pop.PopConsumerService; +import org.apache.rocketmq.broker.pop.PopConsumerService; import org.apache.rocketmq.broker.pop.orderly.ConsumerOrderInfoManager; import org.apache.rocketmq.common.KeyBuilder; import org.apache.rocketmq.common.PopAckConstants; @@ -54,6 +56,27 @@ import org.apache.rocketmq.store.pop.AckMsg; import org.apache.rocketmq.store.pop.BatchAckMsg; +/** + * Processes consumer ack messages in Pop consumption mode. + * + *

Handles both single ({@link RequestCode#ACK_MESSAGE}) and batch + * ({@link RequestCode#BATCH_ACK_MESSAGE}) acks. Each ack is processed + * through one of two paths: + *

    + *
  • KVStore path ({@code popConsumerKVServiceEnable=true}) — + * delegates to {@link PopConsumerService#ackAsync}
  • + *
  • File-based path — tries {@link PopBufferMergeService#addAk} + * first; if the buffer merge is not available, writes the ack as a + * message to the system revive topic
  • + *
+ * + *

Orderly ack is handled separately by {@link #ackOrderly} / + * {@link #ackOrderlyNew}, which update the consumer order info and advance + * the consumer offset while notifying any long-polling waiters. + * + *

This class also owns and manages the {@link PopReviveService} instances + * for the file-based revive path. + */ public class AckMessageProcessor implements NettyRequestProcessor { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); @@ -116,13 +139,36 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, return this.processRequest(ctx.channel(), request, true); } + /** + * Process an ack request (single or batch). + * + *

Routes to one of two paths based on {@code popConsumerKVServiceEnable}: + *

    + *
  • {@code true} — {@link #appendAckNew} (KVStore path, delegates to + * {@link PopConsumerService#ackAsync})
  • + *
  • {@code false} — {@link #appendAck} (file-based path, tries + * {@link PopBufferMergeService#addAk} first, then writes to revive topic)
  • + *
+ * + *

Orderly acks ({@code rqId == POP_ORDER_REVIVE_QUEUE}) are handled by + * {@link #ackOrderly} / {@link #ackOrderlyNew} instead. + * + * @param channel the Netty channel of the requesting client + * @param request the incoming request + * @param brokerAllowSuspend whether the broker may suspend the request + * @return the response to send back to the client + * @throws RemotingCommandException if the request cannot be decoded + */ private RemotingCommand processRequest(final Channel channel, RemotingCommand request, boolean brokerAllowSuspend) throws RemotingCommandException { + // init context params AckMessageRequestHeader requestHeader; BatchAckMessageRequestBody reqBody = null; final RemotingCommand response = RemotingCommand.createResponseCommand(ResponseCode.SUCCESS, null); response.setOpaque(request.getOpaque()); + if (request.getCode() == RequestCode.ACK_MESSAGE) { + // decode and validate request requestHeader = (AckMessageRequestHeader) request.decodeCommandCustomHeader(AckMessageRequestHeader.class); TopicConfig topicConfig = this.brokerController.getTopicConfigManager().selectTopicConfig(requestHeader.getTopic()); @@ -147,6 +193,7 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re return ackLiteResponse; } + // get and validate offset long minOffset = this.brokerController.getMessageStore().getMinOffsetInQueue(requestHeader.getTopic(), requestHeader.getQueueId()); long maxOffset; try { @@ -162,12 +209,15 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re response.setRemark(errorInfo); return response; } + + // append ack, default mode is queue based merge, call appendAck if (brokerController.getBrokerConfig().isPopConsumerKVServiceEnable()) { appendAckNew(requestHeader, null, response, channel, null); } else { appendAck(requestHeader, null, response, channel, null); } } else if (request.getCode() == RequestCode.BATCH_ACK_MESSAGE) { + // decode and validate request if (request.getBody() != 
null) { reqBody = BatchAckMessageRequestBody.decode(request.getBody(), BatchAckMessageRequestBody.class); } @@ -175,7 +225,10 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re response.setCode(ResponseCode.NO_MESSAGE); return response; } + + // process each ack for (BatchAck bAck : reqBody.getAcks()) { + // default value of popConsumerKVServiceEnable is false if (brokerController.getBrokerConfig().isPopConsumerKVServiceEnable()) { appendAckNew(null, bAck, response, channel, reqBody.getBrokerName()); } else { @@ -183,6 +236,7 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re } } } else { + // unsupported request, logging and return POP_LOGGER.error("AckMessageProcessor failed to process RequestCode: {}, consumer: {} ", request.getCode(), RemotingHelper.parseChannelRemoteAddr(channel)); response.setCode(ResponseCode.MESSAGE_ILLEGAL); response.setRemark(String.format("AckMessageProcessor failed to process RequestCode: %d", request.getCode())); @@ -191,8 +245,31 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re return response; } + /** + * Append an ack (single or batch) in the file-based path. + * + *

For single ack: parses the extra info from the request header, + * routes orderly acks to {@link #ackOrderly}, or creates a single {@link AckMsg}. + * + *

For batch ack: expands the {@link BitSet} from the + * {@link BatchAck} into individual offsets, routes orderly acks individually, + * and packs the remaining offsets into a {@link BatchAckMsg}. + * + *

The ack is first offered to {@link PopBufferMergeService#addAk}. + * If the buffer merge is not available, the ack is serialized as JSON and + * written to the revive topic with tag {@link PopAckConstants#ACK_TAG} + * or {@link PopAckConstants#BATCH_ACK_TAG}. + * + * @param requestHeader the single-ack request header (null for batch) + * @param batchAck the batch ack body (null for single) + * @param response the response to modify on error + * @param channel the Netty channel + * @param brokerName the broker name + * @throws RemotingCommandException if offset validation fails + */ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchAck batchAck, final RemotingCommand response, final Channel channel, String brokerName) throws RemotingCommandException { + // init context params String[] extraInfo; String consumeGroup, topic; int qId, rqId; @@ -200,8 +277,11 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA long popTime, invisibleTime; AckMsg ackMsg; int ackCount = 0; + + // ack orderly or set context params if (batchAck == null) { // single ack + // set context params extraInfo = ExtraInfoUtil.split(requestHeader.getExtraInfo()); brokerName = ExtraInfoUtil.getBrokerName(extraInfo); consumeGroup = requestHeader.getConsumerGroup(); @@ -213,15 +293,18 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA popTime = ExtraInfoUtil.getPopTime(extraInfo); invisibleTime = ExtraInfoUtil.getInvisibleTime(extraInfo); + // ack orderly if revive queue if (rqId == KeyBuilder.POP_ORDER_REVIVE_QUEUE) { ackOrderly(topic, consumeGroup, qId, ackOffset, popTime, invisibleTime, channel, response); return; } + // set ackMsg and ackCount ackMsg = new AckMsg(); ackCount = 1; } else { // batch ack + // set context params consumeGroup = batchAck.getConsumerGroup(); topic = ExtraInfoUtil.getRealTopic(batchAck.getTopic(), batchAck.getConsumerGroup(), batchAck.getRetry()); qId = batchAck.getQueueId(); @@ 
-231,6 +314,7 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA popTime = batchAck.getPopTime(); invisibleTime = batchAck.getInvisibleTime(); + // offset check long minOffset = this.brokerController.getMessageStore().getMinOffsetInQueue(topic, qId); long maxOffset; try { @@ -243,6 +327,7 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA return; } + // ack orderly or add offset to batchAckMsg BatchAckMsg batchAckMsg = new BatchAckMsg(); BitSet bitSet = batchAck.getBitSet(); for (int i = bitSet.nextSetBit(0); i >= 0; i = bitSet.nextSetBit(i + 1)) { @@ -259,10 +344,13 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA batchAckMsg.getAckOffsetList().add(offset); } } + + // skip if empty or is revive queue if (rqId == KeyBuilder.POP_ORDER_REVIVE_QUEUE || batchAckMsg.getAckOffsetList().isEmpty()) { return; } + // set ackMsg and ackCount ackMsg = batchAckMsg; ackCount = batchAckMsg.getAckOffsetList().size(); } @@ -270,6 +358,7 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA this.brokerController.getBrokerStatsManager().incBrokerAckNums(ackCount); this.brokerController.getBrokerStatsManager().incGroupAckNums(consumeGroup, topic, ackCount); + // set ackMsg ackMsg.setConsumerGroup(consumeGroup); ackMsg.setTopic(topic); ackMsg.setQueueId(qId); @@ -278,11 +367,13 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA ackMsg.setPopTime(popTime); ackMsg.setBrokerName(brokerName); + // add ackMsg if (this.brokerController.getPopMessageProcessor().getPopBufferMergeService().addAk(rqId, ackMsg)) { brokerController.getPopInflightMessageCounter().decrementInFlightMessageNum(topic, consumeGroup, popTime, qId, ackCount); return; } + // create revive message by ackMsg, if add ackMsg failed MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); msgInner.setTopic(reviveTopic); 
msgInner.setBody(JSON.toJSONString(ackMsg).getBytes(StandardCharsets.UTF_8)); @@ -300,7 +391,9 @@ private void appendAck(final AckMessageRequestHeader requestHeader, final BatchA msgInner.setDeliverTimeMs(popTime + invisibleTime); msgInner.getProperties().put(MessageConst.PROPERTY_UNIQ_CLIENT_MESSAGE_ID_KEYIDX, PopMessageProcessor.genAckUniqueId(ackMsg)); msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); - if (brokerController.getBrokerConfig().isAppendAckAsync()) { + + // store revive message + if (brokerController.getBrokerConfig().isAppendAckAsync()) { // default is false int finalAckCount = ackCount; this.brokerController.getEscapeBridge().asyncPutMessageToSpecificQueue(msgInner).thenAccept(putMessageResult -> { handlePutMessageResult(putMessageResult, ackMsg, topic, consumeGroup, popTime, qId, finalAckCount); @@ -320,6 +413,7 @@ private void appendAckNew(final AckMessageRequestHeader requestHeader, final Bat final RemotingCommand response, final Channel channel, String brokerName) throws RemotingCommandException { if (requestHeader != null && batchAck == null) { + // init context params String[] extraInfo = ExtraInfoUtil.split(requestHeader.getExtraInfo()); String groupId = requestHeader.getConsumerGroup(); String topicId = requestHeader.getTopic(); @@ -329,6 +423,7 @@ private void appendAckNew(final AckMessageRequestHeader requestHeader, final Bat long invisibleTime = ExtraInfoUtil.getInvisibleTime(extraInfo); int reviveQueueId = ExtraInfoUtil.getReviveQid(extraInfo); + if (reviveQueueId == KeyBuilder.POP_ORDER_REVIVE_QUEUE) { ackOrderlyNew(topicId, groupId, queueId, ackOffset, popTime, invisibleTime, channel, response); } else { @@ -339,6 +434,7 @@ private void appendAckNew(final AckMessageRequestHeader requestHeader, final Bat this.brokerController.getBrokerStatsManager().incBrokerAckNums(1); this.brokerController.getBrokerStatsManager().incGroupAckNums(groupId, topicId, 1); } else { + // init context params 
String groupId = batchAck.getConsumerGroup(); String topicId = ExtraInfoUtil.getRealTopic( batchAck.getTopic(), batchAck.getConsumerGroup(), batchAck.getRetry()); @@ -349,6 +445,7 @@ private void appendAckNew(final AckMessageRequestHeader requestHeader, final Bat long invisibleTime = batchAck.getInvisibleTime(); try { + // get minOffset and maxOffset long minOffset = this.brokerController.getMessageStore().getMinOffsetInQueue(topicId, queueId); long maxOffset = this.brokerController.getMessageStore().getMaxOffsetInQueue(topicId, queueId); if (minOffset == -1 || maxOffset == -1) { @@ -360,6 +457,7 @@ private void appendAckNew(final AckMessageRequestHeader requestHeader, final Bat // Maintain consistency with the old implementation code style BitSet bitSet = batchAck.getBitSet(); for (int i = bitSet.nextSetBit(0); i >= 0; i = bitSet.nextSetBit(i + 1)) { + // validate offset if (i == Integer.MAX_VALUE) { break; } @@ -396,23 +494,53 @@ private void handlePutMessageResult(PutMessageResult putMessageResult, AckMsg ac brokerController.getPopInflightMessageCounter().decrementInFlightMessageNum(topic, consumeGroup, popTime, qId, ackCount); } + /** + * Handle an ack for an ordered Pop message in the file-based path. + * + *

The flow is: + *

    + *
  1. Fast-reject if the ack offset is older than the committed + * consumer offset.
  2. + *
  3. Spin-lock on the per-queue key to serialize concurrent ACKs.
  4. + *
  5. Double-check the offset after acquiring the lock.
  6. + *
  7. Call {@code commitAndNext} to advance the {@link OrderInfo}'s + * commit bit and compute the next offset.
  8. + *
  9. If the next offset is valid and the queue is no longer blocked, + * persist the new consumer offset and notify the long-polling requester + * that a new message is available.
  10. + *
+ * + *

If {@code commitAndNext} returns {@code -1}, the response is set to + * {@code MESSAGE_ILLEGAL} since the offset was not found in the in-flight + * order batch. A return value of {@code -2} (popTime mismatch) is + * silently ignored — the batch has already been superseded. + */ protected void ackOrderly(String topic, String consumeGroup, int qId, long ackOffset, long popTime, long invisibleTime, Channel channel, RemotingCommand response) { + // check offset String lockKey = topic + PopAckConstants.SPLIT + consumeGroup + PopAckConstants.SPLIT + qId; long oldOffset = this.brokerController.getConsumerOffsetManager().queryOffset(consumeGroup, topic, qId); if (ackOffset < oldOffset) { return; } + + // lock queue while (!this.brokerController.getPopMessageProcessor().getQueueLockManager().tryLock(lockKey)) { } + try { + // double check offset with lock oldOffset = this.brokerController.getConsumerOffsetManager().queryOffset(consumeGroup, topic, qId); if (ackOffset < oldOffset) { return; } + + // release orderInfo lock long nextOffset = brokerController.getConsumerOrderInfoManager().commitAndNext( topic, consumeGroup, qId, ackOffset, popTime); + if (nextOffset > -1) { + // commit offset if (!this.brokerController.getConsumerOffsetManager().hasOffsetReset(topic, consumeGroup, qId)) { this.brokerController.getConsumerOffsetManager().commitOffset( channel.remoteAddress().toString(), consumeGroup, topic, qId, nextOffset); @@ -420,7 +548,7 @@ protected void ackOrderly(String topic, String consumeGroup, int qId, long ackOf if (!this.brokerController.getConsumerOrderInfoManager().checkBlock(null, topic, consumeGroup, qId, invisibleTime)) { this.brokerController.getPopMessageProcessor().notifyMessageArriving(topic, qId, consumeGroup); } - } else if (nextOffset == -1) { + } else if (nextOffset == -1) { // return error String errorInfo = String.format("offset is illegal, key:%s, old:%d, commit:%d, next:%d, %s", lockKey, oldOffset, ackOffset, nextOffset, channel.remoteAddress()); 
POP_LOGGER.warn(errorInfo); @@ -428,12 +556,25 @@ protected void ackOrderly(String topic, String consumeGroup, int qId, long ackOf response.setRemark(errorInfo); return; } - } finally { + } finally { // unlock queue this.brokerController.getPopMessageProcessor().getQueueLockManager().unLock(lockKey); } brokerController.getPopInflightMessageCounter().decrementInFlightMessageNum(topic, consumeGroup, popTime, qId, 1); } + /** + * Handle an ack for an ordered Pop message in the KVStore path. + * + *

Mirror of {@link #ackOrderly} but uses the {@link PopConsumerService} + * infrastructure: lock service is {@link PopConsumerLockService} keyed by + * {@code group@topic} (coarser than the per-queue lock in the file-based + * path), and the result of {@code commitAndNext} may be logged when + * {@code popConsumerKVServiceLog} is enabled. + * + *

Behavior is otherwise identical: fast-reject, spin-lock, double-check, + * advance {@code OrderInfo} commit bit, persist offset, and notify the + * long-polling requester when the queue is no longer blocked. + */ protected void ackOrderlyNew(String topic, String consumeGroup, int qId, long ackOffset, long popTime, long invisibleTime, Channel channel, RemotingCommand response) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/ChangeInvisibleTimeProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/ChangeInvisibleTimeProcessor.java index 02deeb18a7a..2001cb62e3e 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/ChangeInvisibleTimeProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/ChangeInvisibleTimeProcessor.java @@ -27,6 +27,7 @@ import org.apache.rocketmq.broker.offset.ConsumerOffsetManager; import org.apache.rocketmq.broker.offset.MemoryConsumerOrderInfoManager; import org.apache.rocketmq.broker.pop.PopConsumerLockService; +import org.apache.rocketmq.broker.pop.PopConsumerService; import org.apache.rocketmq.broker.pop.orderly.ConsumerOrderInfoManager; import org.apache.rocketmq.common.PopAckConstants; import org.apache.rocketmq.common.TopicConfig; @@ -52,6 +53,23 @@ import org.apache.rocketmq.store.pop.AckMsg; import org.apache.rocketmq.store.pop.PopCheckPoint; +/** + * Processes the nack {@code ChangeInvisibleTime} request from consumers. + * + *

When a consumer needs more time to process a message (or wants to + * suspend/nack it), this processor updates the message's visibility + * timeout. The implementation varies by the ack mode: + *

    + *
  • KVStore path — delegates to + * {@link PopConsumerService#changeInvisibilityDuration}
  • + *
  • File-based path — writes a new CK to the revive topic with + * the updated invisible time, then acks the original CK so that + * the message will not be revived until the new timeout expires
  • + *
+ * + *

For orderly consumption, the next visible time is updated directly in + * the {@link ConsumerOrderInfoManager} without writing to the revive topic. + */ public class ChangeInvisibleTimeProcessor implements NettyRequestProcessor { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); private final BrokerController brokerController; @@ -71,8 +89,12 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, private RemotingCommand processRequest(final Channel channel, RemotingCommand request, boolean brokerAllowSuspend) throws RemotingCommandException { + // process request async CompletableFuture responseFuture = processRequestAsync(channel, request, brokerAllowSuspend); + // process response sync or async + // default value of appendCkAsync is false + // default value of appendAckAsync is false if (brokerController.getBrokerConfig().isAppendCkAsync() && brokerController.getBrokerConfig().isAppendAckAsync()) { responseFuture.thenAccept(response -> doResponse(channel, request, response)).exceptionally(throwable -> { RemotingCommand response = RemotingCommand.createResponseCommand(ChangeInvisibleTimeResponseHeader.class); @@ -97,8 +119,27 @@ private RemotingCommand processRequest(final Channel channel, RemotingCommand re return null; } + /** + * Asynchronously process a ChangeInvisibleTime request. + * + *

Routes to the appropriate handler based on message type: + *

    + *
  • Lite message — {@link #processChangeInvisibleTimeForLite}
  • + *
  • KVStore path + orderly — {@link #processChangeInvisibleTimeForOrderNew}
  • + *
  • KVStore path + non-orderly — {@link PopConsumerService#changeInvisibilityDuration}
  • + *
  • File-based path + orderly — {@link #processChangeInvisibleTimeForOrder}
  • + *
  • File-based path + non-orderly — {@link #appendCheckPointThenAckOrigin}
  • + *
+ * + * @param channel the Netty channel + * @param request the incoming request + * @param brokerAllowSuspend whether the broker may suspend + * @return a future that completes with the response + * @throws RemotingCommandException if the request cannot be decoded + */ public CompletableFuture processRequestAsync(final Channel channel, RemotingCommand request, boolean brokerAllowSuspend) throws RemotingCommandException { + // decode and validate request final ChangeInvisibleTimeRequestHeader requestHeader = (ChangeInvisibleTimeRequestHeader) request.decodeCommandCustomHeader(ChangeInvisibleTimeRequestHeader.class); RemotingCommand response = RemotingCommand.createResponseCommand(ChangeInvisibleTimeResponseHeader.class); response.setCode(ResponseCode.SUCCESS); @@ -121,11 +162,13 @@ public CompletableFuture processRequestAsync(final Channel chan return CompletableFuture.completedFuture(response); } + // lite topic process CompletableFuture future = processChangeInvisibleTimeForLite(requestHeader, response, responseHeader); if (future != null) { return future; } + // offset check long minOffset = this.brokerController.getMessageStore().getMinOffsetInQueue(requestHeader.getTopic(), requestHeader.getQueueId()); long maxOffset; try { @@ -139,6 +182,9 @@ public CompletableFuture processRequestAsync(final Channel chan } String[] extraInfo = ExtraInfoUtil.split(requestHeader.getExtraInfo()); + + // default value of popConsumerKVServiceEnable is false + // kv based ack service if (brokerController.getBrokerConfig().isPopConsumerKVServiceEnable()) { if (ExtraInfoUtil.isOrder(extraInfo)) { return this.processChangeInvisibleTimeForOrderNew( @@ -159,16 +205,20 @@ public CompletableFuture processRequestAsync(final Channel chan return CompletableFuture.completedFuture(response); } + // file merge based ack service + + // orderly topic if (ExtraInfoUtil.isOrder(extraInfo)) { return CompletableFuture.completedFuture( processChangeInvisibleTimeForOrder(requestHeader, extraInfo, 
response, responseHeader)); } - // add new ck + // add new checkpoint, then ack the original checkpoint long now = System.currentTimeMillis(); CompletableFuture futureResult = appendCheckPointThenAckOrigin(requestHeader, ExtraInfoUtil.getReviveQid(extraInfo), requestHeader.getQueueId(), requestHeader.getOffset(), now, extraInfo); + // format response return futureResult.thenCompose(result -> { if (result) { responseHeader.setInvisibleTime(requestHeader.getInvisibleTime()); @@ -255,8 +305,25 @@ protected RemotingCommand processChangeInvisibleTimeForOrder(ChangeInvisibleTime return response; } + /** + * Ack the original checkpoint after a new checkpoint has been created successfully. + * + *

Called after the new checkpoint has been written successfully. This method + * writes an {@link PopAckConstants#ACK_TAG} message that matches the + * original checkpoint's merge key. When {@link PopReviveService} processes this + * ack, it sets the corresponding bit in the old CK's bitMap, causing + * the old CK to be treated as fully acked and skipped during revive. + * + *

If {@link PopBufferMergeService#addAk} accepts the ack (buffer + * merge enabled), it is merged in memory without writing to the store. + * + * @param requestHeader the original request header + * @param extraInfo the extra info from the original pop request + * @return a future that completes with {@code true} on success + */ private CompletableFuture ackOrigin(final ChangeInvisibleTimeRequestHeader requestHeader, String[] extraInfo) { + // create ackMsg and related message MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); AckMsg ackMsg = new AckMsg(); @@ -273,10 +340,12 @@ private CompletableFuture ackOrigin(final ChangeInvisibleTimeRequestHea this.brokerController.getBrokerStatsManager().incBrokerAckNums(1); this.brokerController.getBrokerStatsManager().incGroupAckNums(requestHeader.getConsumerGroup(), requestHeader.getTopic(), 1); + // add ackMsg if (brokerController.getPopMessageProcessor().getPopBufferMergeService().addAk(rqId, ackMsg)) { return CompletableFuture.completedFuture(true); } + // init message msgInner.setTopic(reviveTopic); msgInner.setBody(JSON.toJSONString(ackMsg).getBytes(StandardCharsets.UTF_8)); msgInner.setQueueId(rqId); @@ -287,6 +356,8 @@ private CompletableFuture ackOrigin(final ChangeInvisibleTimeRequestHea msgInner.setDeliverTimeMs(ExtraInfoUtil.getPopTime(extraInfo) + ExtraInfoUtil.getInvisibleTime(extraInfo)); msgInner.getProperties().put(MessageConst.PROPERTY_UNIQ_CLIENT_MESSAGE_ID_KEYIDX, PopMessageProcessor.genAckUniqueId(ackMsg)); msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); + + // store message return this.brokerController.getEscapeBridge().asyncPutMessageToSpecificQueue(msgInner).thenCompose(putMessageResult -> { if (putMessageResult.getPutMessageStatus() != PutMessageStatus.PUT_OK && putMessageResult.getPutMessageStatus() != PutMessageStatus.FLUSH_DISK_TIMEOUT @@ -302,6 +373,27 @@ private CompletableFuture ackOrigin(final ChangeInvisibleTimeRequestHea }); } + /** 
+ * Extend the visibility timeout by writing a new checkpoint and then acking the old one. + * + *

This is the core of the file-based non-orderly ChangeInvisibleTime path: + *

    + *
  1. Writes a new CK ({@link PopAckConstants#CK_TAG}) to the revive + * topic with the updated {@code invisibleTime}. This CK will trigger a + * revive at the new timeout if not acked.
  2. + *
  3. If the CK is stored successfully, calls {@link #ackOrigin} to write + * an Ack ({@link PopAckConstants#ACK_TAG}) for the original CK, + * preventing the old CK from triggering a premature revive.
  4. + *
+ * + * @param requestHeader the original request header + * @param reviveQid the revive queue to write to + * @param queueId the original queue id + * @param offset the message offset being extended + * @param popTime the new pop time (current time) + * @param extraInfo the extra info from the original pop request + * @return a future that completes with {@code true} on success + */ private CompletableFuture appendCheckPointThenAckOrigin( final ChangeInvisibleTimeRequestHeader requestHeader, int reviveQid, @@ -309,6 +401,8 @@ private CompletableFuture appendCheckPointThenAckOrigin( // add check point msg to revive log MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); msgInner.setTopic(reviveTopic); + + // create checkpoint PopCheckPoint ck = new PopCheckPoint(); ck.setBitMap(0); ck.setNum((byte) 1); @@ -322,6 +416,7 @@ private CompletableFuture appendCheckPointThenAckOrigin( ck.setBrokerName(ExtraInfoUtil.getBrokerName(extraInfo)); ck.setSuspend(requestHeader.isSuspend()); + // init message with checkpoint msgInner.setBody(JSON.toJSONString(ck).getBytes(StandardCharsets.UTF_8)); msgInner.setQueueId(reviveQid); msgInner.setTags(PopAckConstants.CK_TAG); @@ -331,6 +426,9 @@ private CompletableFuture appendCheckPointThenAckOrigin( msgInner.setDeliverTimeMs(ck.getReviveTime() - PopAckConstants.ackTimeInterval); msgInner.getProperties().put(MessageConst.PROPERTY_UNIQ_CLIENT_MESSAGE_ID_KEYIDX, PopMessageProcessor.genCkUniqueId(ck)); msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); + + // store new checkpoint to extend invisible time + // then ack origin checkpoint return this.brokerController.getEscapeBridge().asyncPutMessageToSpecificQueue(msgInner).thenCompose(putMessageResult -> { if (brokerController.getBrokerConfig().isEnablePopLog()) { POP_LOGGER.info("change Invisible, appendCheckPoint, topic {}, queueId {},reviveId {}, cid {}, startOffset {}, rt {}, result {}", requestHeader.getTopic(), queueId, 
reviveQid, requestHeader.getConsumerGroup(), offset, @@ -344,6 +442,8 @@ private CompletableFuture appendCheckPointThenAckOrigin( this.brokerController.getBrokerStatsManager().incGroupCkNums(requestHeader.getConsumerGroup(), requestHeader.getTopic(), 1); } } + + // if success, ack origin checkpoint if (putMessageResult.getPutMessageStatus() != PutMessageStatus.PUT_OK && putMessageResult.getPutMessageStatus() != PutMessageStatus.FLUSH_DISK_TIMEOUT && putMessageResult.getPutMessageStatus() != PutMessageStatus.FLUSH_SLAVE_TIMEOUT diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/EndTransactionProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/EndTransactionProcessor.java index 2be2e188023..fa2ffd4eb14 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/EndTransactionProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/EndTransactionProcessor.java @@ -55,6 +55,34 @@ public EndTransactionProcessor(final BrokerController brokerController) { this.brokerController = brokerController; } + /** + * End a transaction (commit or rollback) for a prepared half message. + * + *

Two incoming paths: + *

    + *
  • Producer end ({@code fromTransactionCheck = false}) — + * the producer explicitly commits or rolls back after the + * half message was written
  • + *
  • Transaction check ({@code fromTransactionCheck = true}) — + * the transaction check calls back the producer, and the producer then sends this request
  • + *
+ * + *

For commit: + *

    + *
  1. Reads the prepared half message from the store
  2. + *
  3. Rejects if the message's immunity time has expired and this is + * not a checker callback
  4. + *
  5. Restores the original topic/queue from properties, writes the + * final message to the store, and deletes the half message
  6. + *
+ * + *

For rollback: deletes the prepared half message without writing. + * + * @param ctx the Netty channel context + * @param request the end-transaction request + * @return the response + * @throws RemotingCommandException if the request cannot be decoded + */ @Override public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand request) throws RemotingCommandException { @@ -68,6 +96,7 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand return response; } + // validate transaction flag and logging if (requestHeader.getFromTransactionCheck()) { switch (requestHeader.getCommitOrRollback()) { case MessageSysFlag.TRANSACTION_NOT_TYPE: { @@ -127,8 +156,10 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand return null; } } + OperationResult result = new OperationResult(); if (MessageSysFlag.TRANSACTION_COMMIT_TYPE == requestHeader.getCommitOrRollback()) { + // get prepare message from prepare topic result = this.brokerController.getTransactionalMessageService().commitMessage(requestHeader); if (result.getResponseCode() == ResponseCode.SUCCESS) { if (rejectCommitOrRollback(requestHeader, result.getPrepareMessage())) { @@ -137,6 +168,7 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand requestHeader.getMsgId(), requestHeader.getCommitLogOffset()); return response; } + RemotingCommand res = checkPrepareMessage(result.getPrepareMessage(), requestHeader); if (res.getCode() == ResponseCode.SUCCESS) { MessageExtBrokerInner msgInner = endMessageTransaction(result.getPrepareMessage()); @@ -145,8 +177,10 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand msgInner.setPreparedTransactionOffset(requestHeader.getCommitLogOffset()); msgInner.setStoreTimestamp(result.getPrepareMessage().getStoreTimestamp()); MessageAccessor.clearProperty(msgInner, MessageConst.PROPERTY_TRANSACTION_PREPARED); + // enqueue message to original topic 
RemotingCommand sendResult = sendFinalMessage(msgInner); if (sendResult.getCode() == ResponseCode.SUCCESS) { + // delete prepare message deletePrepareMessage(result); // successful committed, then total num of half-messages minus 1 this.brokerController.getTransactionalMessageService().getTransactionMetrics().addAndGet(msgInner.getTopic(), -1); @@ -164,6 +198,7 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand return res; } } else if (MessageSysFlag.TRANSACTION_ROLLBACK_TYPE == requestHeader.getCommitOrRollback()) { + // get prepare message from prepare topic result = this.brokerController.getTransactionalMessageService().rollbackMessage(requestHeader); if (result.getResponseCode() == ResponseCode.SUCCESS) { if (rejectCommitOrRollback(requestHeader, result.getPrepareMessage())) { @@ -172,8 +207,10 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand requestHeader.getMsgId(), requestHeader.getCommitLogOffset()); return response; } + RemotingCommand res = checkPrepareMessage(result.getPrepareMessage(), requestHeader); if (res.getCode() == ResponseCode.SUCCESS) { + // delete prepare message deletePrepareMessage(result); // roll back, then total num of half-messages minus 1 this.brokerController.getTransactionalMessageService().getTransactionMetrics().addAndGet(result.getPrepareMessage().getProperty(MessageConst.PROPERTY_REAL_TOPIC), -1); @@ -189,6 +226,21 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand return response; } + /** + * Delete a prepared (half) message after transaction commit or rollback. + * + *

Deletion strategy depends on the half-message storage: + *

    + *
  • {@code RMQ_SYS_TRANS_HALF_TOPIC} — writes an OP record to + * {@code RMQ_SYS_TRANS_OP_HALF_TOPIC} as a logical-delete marker; + * the transaction checker skips messages that have a matching OP
  • + *
  • {@code RMQ_SYS_ROCKSDB_TRANS_HALF_TOPIC} — physically deletes + * the message from RocksDB via + * {@code TransMessageRocksDBStore#deletePrepareMessage}
  • + *
+ * + * @param result the operation result containing the prepared message + */ private void deletePrepareMessage(OperationResult result) { if (null == result || null == result.getPrepareMessage()) { LOGGER.error("deletePrepareMessage param error, result is null or prepareMessage is null"); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/LiteSubscriptionCtlProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/LiteSubscriptionCtlProcessor.java index bcf0df41270..1a72f5a7b2c 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/LiteSubscriptionCtlProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/LiteSubscriptionCtlProcessor.java @@ -37,6 +37,26 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +/** + * Handles subscription control requests for Lite Topics, dispatched to + * {@link LiteSubscriptionRegistry} which manages client → topic → lmq set + * mappings. + * + *

Supports four actions on each {@link LiteSubscriptionDTO} entry: + *

    + *
  • {@code PARTIAL_ADD} — add specific lmq subscriptions without + * overwriting the existing complete subscription set
  • + *
  • {@code PARTIAL_REMOVE} — remove specific lmq subscriptions
  • + *
  • {@code COMPLETE_ADD} — replace the entire subscription set for the + * client with the provided lmq list
  • + *
  • {@code COMPLETE_REMOVE} — drop the entire subscription set for the + * client
  • + *
+ * + *

Quota and ACL errors surface as + * {@link ResponseCode#LITE_SUBSCRIPTION_QUOTA_EXCEEDED} and + * {@link ResponseCode#ILLEGAL_OPERATION} respectively. + */ public class LiteSubscriptionCtlProcessor implements NettyRequestProcessor { protected final Logger log = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LITE_LOGGER_NAME); @@ -48,6 +68,12 @@ public LiteSubscriptionCtlProcessor(BrokerController brokerController, LiteSubsc this.liteSubscriptionRegistry = liteSubscriptionRegistry; } + /** + * Process a batch of subscription control requests. Each entry is validated + * and dispatched to {@link LiteSubscriptionRegistry} according to its + * action. Blank fields cause the entry to be skipped with a warning rather + * than failing the whole batch. + */ @Override public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand request) throws Exception { if (request.getBody() == null) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopBufferMergeService.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopBufferMergeService.java index 5373eaea333..b17c1c7e410 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopBufferMergeService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopBufferMergeService.java @@ -44,10 +44,56 @@ import java.util.concurrent.LinkedBlockingDeque; import java.util.concurrent.atomic.AtomicInteger; +/** + * File based Ack buffer merge service. + * + *

Buffers checkpoints in memory, then enqueues them into the system revive queue, where they wait to be acked. + * + *

Two in-memory data structures drive the merge logic: + *

    + *
  • {@link #buffer} — maps {@code mergeKey} to {@link PopCheckPointWrapper}, + * tracking which sub-messages within a CK batch have been acked + * (via {@code bits} bitmask) and which have been persisted + * (via {@code toStoreBits} bitmask)
  • + *
  • {@link #commitOffsets} — maps {@code topic@cid@queueId} to an ordered + * queue of {@link PopCheckPointWrapper}s for sequential offset committing
  • + *
+ * + *

The background {@link #scan()} thread periodically evaluates each buffered CK: + *

    + *
  • All acks received — removes the CK from the buffer without writing + * anything to storage (clean completion)
  • + *
  • About to expire ({@code reviveTime - now < popCkStayBufferTimeOut}) + * or stayed too long — writes the CK and all un-persisted acks + * (or batch acks) to the revive topic
  • + *
+ * + *

This service is enabled by {@code enablePopBufferMerge} and only runs on + * a master or a slave acting as master. When {@code enablePopBatchAck} is set, + * multiple ack offsets are packed into a single {@link BatchAckMsg}. + */ public class PopBufferMergeService extends ServiceThread { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); + /** + * In-memory map of check points. + * Key: topic + group + queueId + startOffset + popTime + brokerName + * Value: check point wrapper + * use cases: + * - scan: iterate buffer + * - addAckMsg: get check point from buffer and mark ack state of Check Point + */ ConcurrentHashMap buffer = new ConcurrentHashMap<>(1024 * 16); + /** + * manage check point of given consumer and given queue + * Key: topic@cid@queueId + * Value: check point queue of specific consumer and queue + * use cases: + * - getLatestOffset: get consumer next start offset of given queue + * - scanGarbage + * - getOffsetTotalSize: get total popping num + * - isQueueFull + */ ConcurrentHashMap> commitOffsets = new ConcurrentHashMap<>(); private volatile boolean serving = true; @@ -92,6 +138,7 @@ public void run() { // scan while (!this.isStopped()) { try { + // env check if (!isShouldRunning()) { // slave this.waitForRunning(interval * 200 * 5); @@ -104,11 +151,12 @@ public void run() { scan(); if (scanTimes % countOfSecond30 == 0) { + // remove checkpoint which are timeout scanGarbage(); } + // waiting this.waitForRunning(interval); - if (!this.serving && this.buffer.size() == 0 && getOffsetTotalSize() == 0) { this.serving = true; } @@ -118,6 +166,7 @@ public void run() { } } + // scan until buffer is empty this.serving = false; try { Thread.sleep(2000); @@ -133,6 +182,27 @@ public void run() { } } + /** + * Drain the {@link #commitOffsets} queues and commit consumer offsets in FIFO order. + * scanAndCommitOffset may be a better name + * + *

For each {@code topic@cid@queueId} queue, the method peeks the head (oldest) + * wrapper and checks whether it is ready to commit: + *

    + *
  • Just-offset entry with CK stored
  • + *
  • All sub-messages acked ({@link #isCkDone})
  • + *
  • All acks persisted and CK stored ({@link #isCkDoneForFinish})
  • + *
+ * + *

If the head is ready, it is committed and removed. Processing continues + * to the next wrapper in the same queue. If the head is not ready, the loop + * breaks — this ensures strict FIFO order and prevents consumer offset + * regression. + * + *

Called at the end of {@link #scan()} after the buffer has been processed. + * + * @return the total number of remaining wrappers across all queues (for logging) + */ private int scanCommitOffset() { Iterator>> iterator = this.commitOffsets.entrySet().iterator(); int count = 0; @@ -185,9 +255,20 @@ public long getLatestOffset(String topic, String group, int queueId) { return getLatestOffset(KeyBuilder.buildPollingKey(topic, group, queueId)); } + /** + * Remove stale entries from {@link #commitOffsets}. + * + *

Three types of entries are removed: + *

    + *
  • Topic no longer exists (deleted)
  • + *
  • Consumer group no longer exists (unsubscribed)
  • + *
  • No activity for more than 5 minutes (idle)
  • + *
+ */ private void scanGarbage() { Iterator>> iterator = commitOffsets.entrySet().iterator(); while (iterator.hasNext()) { + // validate checkpoint Map.Entry> entry = iterator.next(); if (entry.getKey() == null) { continue; @@ -198,16 +279,23 @@ private void scanGarbage() { } String topic = keyArray[0]; String cid = keyArray[1]; + + // remove if topic no longer exists if (brokerController.getTopicConfigManager().selectTopicConfig(topic) == null) { POP_LOGGER.info("[PopBuffer]remove nonexistent topic {} in buffer!", topic); iterator.remove(); continue; } + + // remove if subscription group no longer exists if (!brokerController.getSubscriptionGroupManager().containsSubscriptionGroup(cid)) { POP_LOGGER.info("[PopBuffer]remove nonexistent subscription group {} of topic {} in buffer!", cid, topic); iterator.remove(); continue; } + + // remove if idle + // entry.getValue().getTime() = popTime of last checkpoint enqueued in the queue if (System.currentTimeMillis() - entry.getValue().getTime() > minute5) { POP_LOGGER.info("[PopBuffer]remove long time not used sub {} of topic {} in buffer!", cid, topic); iterator.remove(); @@ -223,6 +311,26 @@ private boolean isSubscriptionGroupNotExist(PopCheckPointWrapper pointWrapper) { } + /** + * Scan and process all buffered checkpoints, then drain the offset commit queue. + * + *

For each entry in {@link #buffer}: + *

    + *
  • Consumer group not found — removes the entry silently
  • + *
  • CK done (all sub-messages acked) — removes from buffer, no store write needed
  • + *
  • Just-offset entry — writes the CK to the revive topic if not yet stored
  • + *
  • Needs eviction (service stopped, revive timeout, or stay timeout) — + * writes the CK and all un-persisted acks (batch or individual) to the revive topic, + * then removes the entry when all persisted
  • + *
  • Otherwise — leaves the entry in the buffer for the next scan cycle
  • + *
+ * + *

After processing the buffer, calls {@link #scanCommitOffset()} to commit offsets + * for finished checkpoints in FIFO order. + * + *

If the scan duration exceeds {@code popCkStayBufferTimeOut - 1000ms}, the service + * temporarily stops accepting new CKs ({@link #serving} = false) to avoid backlog. + */ private void scan() { long startTime = System.currentTimeMillis(); AtomicInteger count = new AtomicInteger(0); @@ -244,7 +352,6 @@ private void scan() { continue; } - // just process offset(already stored at pull thread), or buffer ck(not stored and ack finish) if (pointWrapper.isJustOffset() && pointWrapper.isCkStored() || isCkDone(pointWrapper) || isCkDoneForFinish(pointWrapper) && pointWrapper.isCkStored()) { @@ -259,6 +366,7 @@ private void scan() { PopCheckPoint point = pointWrapper.getCk(); long now = System.currentTimeMillis(); + // check whether check point is timeout boolean removeCk = !this.serving; // ck will be timeout if (point.getReviveTime() - now < brokerController.getBrokerConfig().getPopCkStayBufferTimeOut()) { @@ -275,17 +383,18 @@ private void scan() { } // double check - if (isCkDone(pointWrapper)) { + if (isCkDone(pointWrapper)) { // all checkpoint are acked, do nothing continue; - } else if (pointWrapper.isJustOffset()) { + } else if (pointWrapper.isJustOffset()) { // store checkpoint // just offset should be in store. 
if (pointWrapper.getReviveQueueOffset() < 0) { putCkToStore(pointWrapper, this.brokerController.getBrokerConfig().isAppendCkAsync()); countCk++; } continue; - } else if (removeCk) { + } else if (removeCk) { // store checkpoint if needed // put buffer ak to store + // revive queue offset < 0 means checkpoint was not stored if (pointWrapper.getReviveQueueOffset() < 0) { putCkToStore(pointWrapper, this.brokerController.getBrokerConfig().isAppendCkAsync()); countCk++; @@ -295,11 +404,13 @@ private void scan() { continue; } - if (brokerController.getBrokerConfig().isEnablePopBatchAck()) { + // store checkpoint + if (brokerController.getBrokerConfig().isEnablePopBatchAck()) { // default is false List indexList = this.batchAckIndexList; try { for (byte i = 0; i < point.getNum(); i++) { // reput buffer ak to store + // if checkpoint is acked and not stored, add to indexList if (DataConverter.getBit(pointWrapper.getBits().get(), i) && !DataConverter.getBit(pointWrapper.getToStoreBits().get(), i)) { indexList.add(i); @@ -314,6 +425,7 @@ private void scan() { } else { for (byte i = 0; i < point.getNum(); i++) { // reput buffer ak to store + // if checkpoint is acked and not stored, call putAckToStore if (DataConverter.getBit(pointWrapper.getBits().get(), i) && !DataConverter.getBit(pointWrapper.getToStoreBits().get(), i)) { putAckToStore(pointWrapper, i, count); @@ -321,6 +433,7 @@ private void scan() { } } + // remove checkpoint from buffer if (isCkDoneForFinish(pointWrapper) && pointWrapper.isCkStored()) { if (brokerController.getBrokerConfig().isEnablePopLog()) { POP_LOGGER.info("[PopBuffer]ck finish, {}", pointWrapper); @@ -331,8 +444,10 @@ private void scan() { } } + // scan commitOffsets and commit offset which is needed. 
int offsetBufferSize = scanCommitOffset(); + // calculate scan times long eclipse = System.currentTimeMillis() - startTime; if (eclipse > brokerController.getBrokerConfig().getPopCkStayBufferTimeOut() - 1000) { POP_LOGGER.warn("[PopBuffer]scan stop, because eclipse too long, PopBufferEclipse={}, " + @@ -370,6 +485,15 @@ public int getBufferedCKSize() { return this.counter.get(); } + /** + * Atomically set the bit at {@code index} in an {@link AtomicInteger} bitmask. + * + *

Uses a CAS (compare-and-swap) loop to ensure thread safety without locking. + * If the bit is already set, this method returns immediately (no-op). + * + * @param setBits the atomic bitmask to update + * @param index the bit position (0-based) + */ private void markBitCAS(AtomicInteger setBits, int index) { while (true) { int bits = setBits.get(); @@ -384,6 +508,22 @@ private void markBitCAS(AtomicInteger setBits, int index) { } } + /** + * Commit the consumer offset for the checkpoint's {@code topic@cid@queueId}. + * + *

Called from {@link #scanCommitOffset()} after the checkpoint is confirmed + * as finished (all acks received or CK stored). The offset is advanced to + * {@link PopCheckPointWrapper#nextBeginOffset}, which is the offset of the + * first message after this batch. + * + *

The operation is guarded by {@link PopMessageProcessor.QueueLockManager} + * to prevent concurrent offset updates on the same queue. + * + * @param wrapper the finished checkpoint wrapper + * @return {@code true} if the offset was committed or no commit is needed + * ({@code nextBeginOffset < 0}); {@code false} if the lock could + * not be acquired (caller should retry later) + */ private boolean commitOffset(final PopCheckPointWrapper wrapper) { if (wrapper.getNextBeginOffset() < 0) { return true; @@ -413,8 +553,25 @@ private boolean commitOffset(final PopCheckPointWrapper wrapper) { return true; } + /** + * Enqueue the checkpoint wrapper into the per-{@code topic@cid@queueId} offset queue + * for sequential offset committing. + * + *

The queue is maintained in FIFO order. The {@link #scanCommitOffset()} method + * drains the queue from the head, ensuring that offsets are committed in the same + * order as the checkpoints were created, which prevents consumer offset regression. + * + *

The {@link QueueWithTime#time} is also updated to the CK's pop time so that + * {@link #scanGarbage()} can identify and remove stale entries after 5 minutes of + * inactivity. + * + * @param pointWrapper the checkpoint wrapper to enqueue + * @return true if the element was added to the queue successfully + */ private boolean putOffsetQueue(PopCheckPointWrapper pointWrapper) { QueueWithTime queue = this.commitOffsets.get(pointWrapper.getLockKey()); + + // init with empty queue if (queue == null) { queue = new QueueWithTime<>(); QueueWithTime old = this.commitOffsets.putIfAbsent(pointWrapper.getLockKey(), queue); @@ -422,6 +579,7 @@ private boolean putOffsetQueue(PopCheckPointWrapper pointWrapper) { queue = old; } } + queue.setTime(pointWrapper.getCk().getPopTime()); return queue.get().offer(pointWrapper); } @@ -436,12 +594,13 @@ private boolean checkQueueOk(PopCheckPointWrapper pointWrapper) { /** * put to store && add to buffer. + * addAndStoreCheckpoint maybe a better name. * - * @param point - * @param reviveQueueId - * @param reviveQueueOffset - * @param nextBeginOffset - * @return + * @param point check point + * @param reviveQueueId revive queueId + * @param reviveQueueOffset revive queueOffset + * @param nextBeginOffset next offset + * @return true if success */ public boolean addCkJustOffset(PopCheckPoint point, int reviveQueueId, long reviveQueueOffset, long nextBeginOffset) { @@ -454,6 +613,8 @@ public boolean addCkJustOffset(PopCheckPoint point, int reviveQueueId, long revi return false; } + // called before buffer operation + // because store operation will update attributes of pointWrapper this.putCkToStore(pointWrapper, checkQueueOk(pointWrapper)); putOffsetQueue(pointWrapper); @@ -465,8 +626,17 @@ public boolean addCkJustOffset(PopCheckPoint point, int reviveQueueId, long revi return true; } + /** + * mock checkpoint then add it to offset queue. 
+ * this method is called when popped message is: + * - NO_MATCHED_MESSAGE + * - OFFSET_FOUND_NULL + * - MESSAGE_WAS_REMOVING + * - NO_MATCHED_LOGIC_QUEUE + */ public void addCkMock(String group, String topic, int queueId, long startOffset, long invisibleTime, long popTime, int reviveQueueId, long nextBeginOffset, String brokerName) { + // create checkpoint final PopCheckPoint ck = new PopCheckPoint(); ck.setBitMap(0); ck.setNum((byte) 0); @@ -482,12 +652,17 @@ public void addCkMock(String group, String topic, int queueId, long startOffset, pointWrapper.setCkStored(true); putOffsetQueue(pointWrapper); + if (brokerController.getBrokerConfig().isEnablePopLog()) { POP_LOGGER.info("[PopBuffer]add ck just offset, mocked, {}", pointWrapper); } } + /** + * add checkpoint to buffer. + */ public boolean addCk(PopCheckPoint point, int reviveQueueId, long reviveQueueOffset, long nextBeginOffset) { + // validate env and checkpoint // key: point.getT() + point.getC() + point.getQ() + point.getSo() + point.getPt() if (!brokerController.getBrokerConfig().isEnablePopBufferMerge()) { return false; @@ -531,14 +706,39 @@ public boolean addCk(PopCheckPoint point, int reviveQueueId, long reviveQueueOff return true; } + /** + * Merge a consumer ack into the buffered checkpoint. + * + *

The ack is not written to the revive topic immediately. Instead, a flag is + * set in {@link PopCheckPointWrapper#bits} via {@link #markBitCAS}. + * The pending ack will later be flushed to storage by {@link #scan()} when the + * checkpoint is evicted (timeout / buffer full / service stopping). + * + *

Rejection conditions (return false): + *

    + *
  • {@code enablePopBufferMerge} is disabled
  • + *
  • The service is not serving (too busy)
  • + *
  • No matching checkpoint found in {@link #buffer}
  • + *
  • The checkpoint is a {@code justOffset} entry (no messages to ack)
  • + *
  • The checkpoint is too close to its revive deadline
  • + *
  • The checkpoint has been buffered for too long
  • + *
+ * + * @param reviveQid revive queue id (used only for logging) + * @param ackMsg the ack message from the consumer + * @return true if the ack was merged successfully + */ public boolean addAk(int reviveQid, AckMsg ackMsg) { + // validate env if (!brokerController.getBrokerConfig().isEnablePopBufferMerge()) { return false; } if (!serving) { return false; } + try { + // get and validate checkpoint PopCheckPointWrapper pointWrapper = this.buffer.get(ackMsg.getTopic() + ackMsg.getConsumerGroup() + ackMsg.getQueueId() + ackMsg.getStartOffset() + ackMsg.getPopTime() + ackMsg.getBrokerName()); if (pointWrapper == null) { if (brokerController.getBrokerConfig().isEnablePopLog()) { @@ -568,7 +768,8 @@ public boolean addAk(int reviveQid, AckMsg ackMsg) { return false; } - if (ackMsg instanceof BatchAckMsg) { + // merge ackMsg with checkpoint + if (ackMsg instanceof BatchAckMsg) { // merge batch ackMsg for (Long ackOffset : ((BatchAckMsg) ackMsg).getAckOffsetList()) { int indexOfAck = point.indexOfAck(ackOffset); if (indexOfAck > -1) { @@ -577,7 +778,7 @@ public boolean addAk(int reviveQid, AckMsg ackMsg) { POP_LOGGER.error("[PopBuffer]Invalid index of ack, reviveQid={}, {}, {}", reviveQid, ackMsg, point); } } - } else { + } else { // merge ackMsg int indexOfAck = point.indexOfAck(ackMsg.getAckOffset()); if (indexOfAck > -1) { markBitCAS(pointWrapper.getBits(), indexOfAck); @@ -587,6 +788,7 @@ public boolean addAk(int reviveQid, AckMsg ackMsg) { } } + // logging if (brokerController.getBrokerConfig().isEnablePopLog()) { POP_LOGGER.info("[PopBuffer]add ack, rqId={}, {}, {}", reviveQid, pointWrapper, ackMsg); } @@ -608,6 +810,12 @@ public void clearOffsetQueue(String lockKey) { this.commitOffsets.remove(lockKey); } + /** + * write message(checkpoint) to revive topic, then update pointWrapper related info. 
+ * + * @param pointWrapper checkpoint + * @param runInCurrent async or sync + */ private void putCkToStore(final PopCheckPointWrapper pointWrapper, final boolean runInCurrent) { if (pointWrapper.getReviveQueueOffset() >= 0) { return; @@ -617,6 +825,7 @@ private void putCkToStore(final PopCheckPointWrapper pointWrapper, final boolean // Indicates that ck message is storing pointWrapper.setReviveQueueOffset(Long.MAX_VALUE); + // default value of isAppendCkAsync is false if (brokerController.getBrokerConfig().isAppendCkAsync() && runInCurrent) { brokerController.getEscapeBridge().asyncPutMessageToSpecificQueue(msgInner).thenAccept(putMessageResult -> { handleCkMessagePutResult(putMessageResult, pointWrapper); @@ -655,7 +864,21 @@ private void handleCkMessagePutResult(PutMessageResult putMessageResult, final P } } + /** + * Persist message which created by checkpoint to the revive topic. + * + *
    + *
  • build the ack message from the checkpoint
  • + *
  • write message to revive topic
  • + *
  • update pointWrapper related info
  • + *
+ * + * @param pointWrapper the checkpoint wrapper containing the original CK + * @param msgIndex the sub-message index within the CK batch to ack + * @param count atomic counter incremented on successful persistence + */ private void putAckToStore(final PopCheckPointWrapper pointWrapper, byte msgIndex, AtomicInteger count) { + // build ackMsg and Message by checkpoint PopCheckPoint point = pointWrapper.getCk(); MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); final AckMsg ackMsg = new AckMsg(); @@ -679,7 +902,8 @@ private void putAckToStore(final PopCheckPointWrapper pointWrapper, byte msgInde msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); - if (brokerController.getBrokerConfig().isAppendAckAsync()) { + // store message then change store status of the checkpoint + if (brokerController.getBrokerConfig().isAppendAckAsync()) { // default value is false brokerController.getEscapeBridge().asyncPutMessageToSpecificQueue(msgInner).thenAccept(putMessageResult -> { handleAckPutMessageResult(ackMsg, putMessageResult, pointWrapper, count, msgIndex); }).exceptionally(throwable -> { @@ -687,11 +911,22 @@ private void putAckToStore(final PopCheckPointWrapper pointWrapper, byte msgInde return null; }); } else { + // store message PutMessageResult putMessageResult = brokerController.getEscapeBridge().putMessageToSpecificQueue(msgInner); + // change store status of the checkpoint handleAckPutMessageResult(ackMsg, putMessageResult, pointWrapper, count, msgIndex); } } + /** + * update store status of checkpoint if revive message stored successfully. 
+ * + * @param ackMsg the ack message that was persisted + * @param putMessageResult the result returned by the store + * @param pointWrapper the checkpoint wrapper being processed + * @param count atomic counter incremented on success + * @param msgIndex the sub-message index that was persisted + */ private void handleAckPutMessageResult(AckMsg ackMsg, PutMessageResult putMessageResult, PopCheckPointWrapper pointWrapper, AtomicInteger count, byte msgIndex) { brokerController.getBrokerMetricsManager().getPopMetricsManager().incPopReviveAckPutCount(ackMsg, putMessageResult.getPutMessageStatus()); @@ -797,6 +1032,17 @@ private boolean cancelCkTimer(final PopCheckPointWrapper pointWrapper) { return true; } + /** + * Check whether all sub-messages in the checkpoint have been acked. + * + *

Every sub-message has a corresponding bit in + * {@link PopCheckPointWrapper#bits}. This method returns {@code true} when + * all bits are set, meaning the CK can be removed from the buffer without + * writing any ack to the revive topic (clean completion). + * + * @param pointWrapper the checkpoint wrapper to check + * @return {@code true} if every sub-message has been acked + */ private boolean isCkDone(PopCheckPointWrapper pointWrapper) { byte num = pointWrapper.getCk().getNum(); for (byte i = 0; i < num; i++) { @@ -807,6 +1053,18 @@ private boolean isCkDone(PopCheckPointWrapper pointWrapper) { return true; } + /** + * Check whether all acked sub-messages have been fully persisted. + * + *

Uses XOR: {@code bits ^ toStoreBits}. A bit is set in the result when + * the corresponding sub-message has been acked ({@code bits}) but not yet + * persisted ({@code toStoreBits}). Returns {@code true} only when every + * acked message has also been persisted, meaning the checkpoint is ready + * for final cleanup. + * + * @param pointWrapper the checkpoint wrapper to check + * @return {@code true} if no ack remains to be persisted + */ private boolean isCkDoneForFinish(PopCheckPointWrapper pointWrapper) { byte num = pointWrapper.getCk().getNum(); int bits = pointWrapper.getBits().get() ^ pointWrapper.getToStoreBits().get(); @@ -842,17 +1100,46 @@ public LinkedBlockingDeque get() { public class PopCheckPointWrapper { private final int reviveQueueId; - // -1: not stored, >=0: stored, Long.MAX: storing. + /** + * The consume queue offset of the CK message in the revive topic. + * + *

Three-state indicator: + *

    + *
  • {@code -1} — not yet stored; {@link #putCkToStore} will write it
  • + *
  • {@code >= 0} — successfully stored; the value is the offset in the + * revive topic's consume queue
  • + *
  • {@link Long#MAX_VALUE} — a write is in progress (prevents duplicate + * writes from concurrent scans)
  • + *
+ */ private volatile long reviveQueueOffset; private final PopCheckPoint ck; - // bit for concurrent + // store ack states of messages, one bit for each message private final AtomicInteger bits; - // bit for stored buffer ak + // bits for stored buffer ak, one bit for each message private final AtomicInteger toStoreBits; + // nextOffset of original topic private final long nextBeginOffset; + // topic@group@queueId private final String lockKey; + // topic + group + queueId + startOffset + popTime + brokerName private final String mergeKey; + /** + * Whether this checkpoint should be written to the revive topic directly. + * + *

When {@code true}: + *

    + *
  • The CK has already been or will be written to the revive topic directly
  • + *
  • No Ack merging is needed — {@link #addAk} rejects these entries
  • + *
  • The wrapper exists solely to maintain FIFO offset commit order in + * {@link #commitOffsets}
  • + *
+ * + * @see PopBufferMergeService#addCkJustOffset + * @see PopBufferMergeService#addCkMock + */ private final boolean justOffset; + // whether check point has stored in revive queue private volatile boolean ckStored = false; public PopCheckPointWrapper(int reviveQueueId, long reviveQueueOffset, PopCheckPoint point, diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopMessageProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopMessageProcessor.java index 55cabe6f5e5..dfbe2f40bfd 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopMessageProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopMessageProcessor.java @@ -33,6 +33,7 @@ import org.apache.rocketmq.broker.longpolling.PopRequest; import org.apache.rocketmq.broker.pagecache.ManyMessageTransfer; import org.apache.rocketmq.broker.pop.PopConsumerContext; +import org.apache.rocketmq.broker.pop.PopConsumerService; import org.apache.rocketmq.common.BrokerConfig; import org.apache.rocketmq.common.KeyBuilder; import org.apache.rocketmq.common.MixAll; @@ -99,6 +100,24 @@ import static org.apache.rocketmq.remoting.metrics.RemotingMetricsConstant.LABEL_RESPONSE_CODE; import static org.apache.rocketmq.remoting.metrics.RemotingMetricsConstant.LABEL_RESULT; +/** + * Processes PopMessage requests from consumers. + * + *

This is the core processor for the Pop consumption mode. It handles: + *

    + *
  • Validating the request (topic, group, queue, subscription, permissions)
  • + *
  • Routing to the {@link PopConsumerService} (KVStore path) or the + * inline file-based path
  • + *
  • Popping messages from normal and retry topics (V1/V2)
  • + *
  • Creating checkpoints and writing them to the revive topic
  • + *
  • Long-polling suspension via {@link PopLongPollingService}
  • + *
  • Transferring messages to the client (heap copy or zero-copy)
  • + *
+ * + *

This class also owns the {@link PopLongPollingService}, + * {@link PopBufferMergeService}, and {@link QueueLockManager} instances + * used by the file-based ack path. + */ public class PopMessageProcessor implements NettyRequestProcessor { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); @@ -217,13 +236,36 @@ public void notifyMessageArriving(final String topic, final int queueId, final S topic, queueId, cid, false, null, 0L, null, null); } + /** + * Process a PopMessage request. + * + *

This method handles the full Pop lifecycle: + *

    + *
  1. Validates the request (topic, group, permissions, subscription)
  2. + *
  3. Routes to the KVStore path (via {@link PopConsumerService#popAsync}) + * or the file-based path (inline CompletableFuture chain)
  4. + *
  5. Pops messages from normal and retry topics (V1/V2)
  6. + *
  7. Creates checkpoints and appends them to the revive topic
  8. + *
  9. Suspends the request via {@link PopLongPollingService#polling} if + * no messages are available
  10. + *
  11. Transfers messages via heap copy or zero-copy ({@code FileRegion})
  12. + *
+ * + * @param ctx the Netty channel handler context + * @param request the incoming PopMessage request + * @return the response, or {@code null} if the response is sent asynchronously + * (zero-copy path or long-polling suspension) + * @throws RemotingCommandException if the request cannot be decoded + */ @Override public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingCommand request) throws RemotingCommandException { + // init request and response final long beginTimeMills = this.brokerController.getMessageStore().now(); Channel channel = ctx.channel(); + RemotingCommand response = RemotingCommand.createResponseCommand(PopMessageResponseHeader.class); response.setOpaque(request.getOpaque()); @@ -235,6 +277,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC } final PopMessageResponseHeader responseHeader = (PopMessageResponseHeader) response.readCustomHeader(); + // validation // Pop mode only supports consumption in cluster load balancing mode brokerController.getConsumerManager().compensateBasicConsumerInfo( requestHeader.getConsumerGroup(), ConsumeType.CONSUME_POP, MessageModel.CLUSTERING); @@ -314,6 +357,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC return response; } + // init filter BrokerConfig brokerConfig = brokerController.getBrokerConfig(); SubscriptionData subscriptionData = null; ExpressionMessageFilter messageFilter = null; @@ -377,6 +421,9 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC ExpressionMessageFilter finalMessageFilter = messageFilter; SubscriptionData finalSubscriptionData = subscriptionData; + // There are two type of ack mode: + // 1. ack by KV service + // 2. 
ack by file merge service, default mode if (brokerConfig.isPopConsumerKVServiceEnable()) { CompletableFuture popAsyncFuture = brokerController.getPopConsumerService().popAsync( @@ -386,6 +433,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC requestHeader.getAttemptId(), requestHeader.getInitMode(), messageFilter); popAsyncFuture.thenApply(result -> { + // callback try { if (request.getCallbackList() != null) { request.getCallbackList().forEach(CommandCallback::accept); @@ -395,6 +443,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC POP_LOGGER.error("PopProcessor execute callback error", t); } + // long polling process, useless in rocketmq 5.* if (result.isFound()) { response.setCode(ResponseCode.SUCCESS); getMessageResult.setStatus(GetMessageStatus.FOUND); @@ -427,6 +476,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC getMessageResult.setStatus(GetMessageStatus.NO_MESSAGE_IN_QUEUE); } + // format response responseHeader.setPopTime(result.getPopTime()); responseHeader.setInvisibleTime(result.getInvisibleTime()); responseHeader.setReviveQid( @@ -487,8 +537,10 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC return response; }).thenAccept(result -> NettyRemotingAbstract.writeResponse(channel, request, result, null, brokerController.getBrokerMetricsManager().getRemotingMetricsManager())); return null; - } + } // end of ack by kv service + // start of ack by file merge service mode + // init pop parameters int randomQ = random.nextInt(100); int reviveQid; if (requestHeader.isOrder()) { @@ -518,7 +570,10 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC } randomQ = usePriorityMode ? 
0 : randomQ; // reset randomQ long popTime = System.currentTimeMillis(); + + // pop message CompletableFuture getMessageFuture = CompletableFuture.completedFuture(0L); + // pop message from retry topic if (needRetry && !requestHeader.isOrder()) { if (needRetryV1) { String retryTopic = KeyBuilder.buildPopRetryTopicV1(requestHeader.getTopic(), requestHeader.getConsumerGroup()); @@ -530,6 +585,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC popTime, finalMessageFilter, startOffsetInfo, msgOffsetInfo, orderCountInfo, randomQ, getMessageFuture); } } + if (requestHeader.getQueueId() < 0) { // read all queue getMessageFuture = popMsgFromTopic(topicConfig, false, getMessageResult, requestHeader, reviveQid, channel, @@ -541,6 +597,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC getMessageResult, requestHeader, queueId, restNum, reviveQid, channel, popTime, finalMessageFilter, startOffsetInfo, msgOffsetInfo, orderCountInfo)); } + // if not full , fetch retry again if (!needRetry && getMessageResult.getMessageMapedList().size() < requestHeader.getMaxMsgNums() && !requestHeader.isOrder()) { if (needRetryV1) { @@ -554,8 +611,10 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC } } + // async result handle final RemotingCommand finalResponse = response; getMessageFuture.thenApply(restNum -> { + // execute callback try { if (request.getCallbackList() != null) { request.getCallbackList().forEach(CommandCallback::accept); @@ -565,6 +624,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC POP_LOGGER.error("PopProcessor execute callback error", t); } + // long polling used in version 4.*, useless in 5.* if (!getMessageResult.getMessageBufferList().isEmpty()) { finalResponse.setCode(ResponseCode.SUCCESS); getMessageResult.setStatus(GetMessageStatus.FOUND); @@ -591,6 +651,8 @@ public RemotingCommand processRequest(final ChannelHandlerContext 
ctx, RemotingC } getMessageResult.setStatus(GetMessageStatus.NO_MESSAGE_IN_QUEUE); } + + // format response responseHeader.setInvisibleTime(requestHeader.getInvisibleTime()); responseHeader.setPopTime(popTime); responseHeader.setReviveQid(reviveQid); @@ -601,6 +663,9 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC responseHeader.setOrderCountInfo(orderCountInfo.toString()); } finalResponse.setRemark(getMessageResult.getStatus().name()); + + // transfer msg by heap or zero copy, + // zero copy used in 4.*, useless in 5.* switch (finalResponse.getCode()) { case ResponseCode.SUCCESS: if (this.brokerController.getBrokerConfig().isTransferMsgByHeap()) { @@ -610,7 +675,7 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC requestHeader.getTopic(), requestHeader.getQueueId(), (int) (this.brokerController.getMessageStore().now() - beginTimeMills)); finalResponse.setBody(r); - } else { + } else { // zero copy final GetMessageResult tmpGetMessageResult = getMessageResult; try { FileRegion fileRegion = @@ -647,6 +712,37 @@ public RemotingCommand processRequest(final ChannelHandlerContext ctx, RemotingC return null; } + /** + * Pop messages from every read queue of the given topic. + * + *

Queues are visited sequentially (respecting {@code priorityOrderAsc}). + * For each queue a {@link #popMsgFromQueue} call is chained via + * {@code CompletableFuture#thenCompose}. The chained future carries the + * remaining number of messages still needed ({@code restNum}). + * + *

Early termination can occur inside {@link #popMsgFromQueue} when: + *

    + *
  • the queue lock cannot be acquired
  • + *
  • too many in-flight (un-acked) messages exist
  • + *
  • an order queue is blocked
  • + *
  • the accumulated message count already reaches {@code maxMsgNums}
  • + *
+ * + * @param topicConfig topic configuration; {@code null} skips all queues + * @param isRetry whether the topic is a retry topic + * @param getMessageResult accumulator for the messages popped so far + * @param requestHeader pop request parameters + * @param reviveQid revive queue id + * @param channel netty channel of the requesting client + * @param popTime pop timestamp + * @param messageFilter expression filter applied to each message + * @param startOffsetInfo buffer for offset tracing info + * @param msgOffsetInfo buffer for per-message offset tracing info + * @param orderCountInfo buffer for order-consume count info + * @param randomQ random queue offset for round-robin load balancing + * @param getMessageFuture future that carries the remaining message count + * @return a future completing with the remaining number of messages needed + */ private CompletableFuture popMsgFromTopic(TopicConfig topicConfig, boolean isRetry, GetMessageResult getMessageResult, PopMessageRequestHeader requestHeader, int reviveQid, Channel channel, long popTime, ExpressionMessageFilter messageFilter, StringBuilder startOffsetInfo, @@ -674,12 +770,52 @@ private CompletableFuture popMsgFromTopic(String topic, boolean isRetry, G messageFilter, startOffsetInfo, msgOffsetInfo, orderCountInfo, randomQ, getMessageFuture); } + /** + * Pop messages from a specific queue of a topic. + * + *

This method is called as a step in a {@link CompletableFuture} chain + * (see {@link #popMsgFromTopic}). The {@code restNum} argument is the + * number of messages still needed — when it drops to {@code 0} or below, + * subsequent calls in the chain may short-circuit early. + * + *

The method has several early-termination paths (all return + * immediately with the current {@code restNum}): + *

    + *
  • Queue lock cannot be acquired — skips this queue
  • + *
  • Too many in-flight (un-acked) messages for this + * {@code topic@group@queueId}
  • + *
  • Order queue is blocked by a previous un-acked message
  • + *
  • Already accumulated {@code >= maxMsgNums} messages
  • + *
+ * + *

Otherwise, it asynchronously fetches messages from the store, handles + * offset correction, updates order-consume tracking / checkpoint data, and + * merges the results into {@code getMessageResult}. + * + * @param topic topic name + * @param attemptId attempt id for idempotent consumption + * @param isRetry whether this is a retry topic + * @param getMessageResult accumulator for messages popped so far + * @param requestHeader pop request parameters + * @param queueId target queue id + * @param restNum number of messages still needed before the batch + * size is satisfied + * @param reviveQid revive queue id for checkpoint + * @param channel netty channel of the requesting client + * @param popTime pop invocation timestamp + * @param messageFilter expression filter applied to each message + * @param startOffsetInfo buffer for offset tracing info + * @param msgOffsetInfo buffer for per-message offset tracing info + * @param orderCountInfo buffer for order-consume count info + * @return a future completing with the remaining number of messages needed + */ private CompletableFuture popMsgFromQueue(String topic, String attemptId, boolean isRetry, GetMessageResult getMessageResult, PopMessageRequestHeader requestHeader, int queueId, long restNum, int reviveQid, Channel channel, long popTime, ExpressionMessageFilter messageFilter, StringBuilder startOffsetInfo, StringBuilder msgOffsetInfo, StringBuilder orderCountInfo) { + // get pop offset String lockKey = topic + PopAckConstants.SPLIT + requestHeader.getConsumerGroup() + PopAckConstants.SPLIT + queueId; boolean isOrder = requestHeader.isOrder(); @@ -693,6 +829,7 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, return failure; } + // try lock CompletableFuture future = new CompletableFuture<>(); if (!queueLockManager.tryLock(lockKey)) { try { @@ -705,8 +842,9 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, } return future; } - future.whenComplete((result, 
throwable) -> queueLockManager.unLock(lockKey)); + + // check inflight message number if (isPopShouldStop(topic, requestHeader.getConsumerGroup(), queueId)) { POP_LOGGER.warn("Too much msgs unacked, then stop popping. topic={}, group={}, queueId={}", topic, requestHeader.getConsumerGroup(), queueId); @@ -719,6 +857,7 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, return future; } + // check orderly lock and max message number try { offset = getPopOffset(topic, requestHeader.getConsumerGroup(), queueId, requestHeader.getInitMode(), true, lockKey, true); @@ -759,6 +898,7 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, return this.brokerController.getMessageStore() .getMessageAsync(requestHeader.getConsumerGroup(), topic, queueId, offset, requestHeader.getMaxMsgNums() - getMessageResult.getMessageMapedList().size(), messageFilter) + // result check and retry if offset is not correct .thenCompose(result -> { if (result == null) { return CompletableFuture.completedFuture(null); @@ -779,7 +919,9 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, requestHeader.getMaxMsgNums() - getMessageResult.getMessageMapedList().size(), messageFilter); } return CompletableFuture.completedFuture(result); - }).thenApply(result -> { + }) + // update order info or append checkpoint then format result + .thenApply(result -> { if (result == null) { try { atomicRestNum.set(brokerController.getMessageStore().getMaxOffsetInQueue(topic, queueId) - atomicOffset.get() + atomicRestNum.get()); @@ -871,7 +1013,9 @@ private CompletableFuture popMsgFromQueue(String topic, String attemptId, result.getMessageCount() ); return atomicRestNum.get(); - }).whenComplete((result, throwable) -> { + }) + // unlock queueLock + .whenComplete((result, throwable) -> { if (throwable != null) { POP_LOGGER.error("Pop message error, {}", lockKey, throwable); } @@ -884,14 +1028,37 @@ private boolean isPopShouldStop(String topic, String 
group, int queueId) { brokerController.getPopInflightMessageCounter().getGroupPopInFlightMessageNum(topic, group, queueId) > brokerController.getBrokerConfig().getPopInflightMessageThreshold(); } + /** + * get consume offset for pop mode + * called by: + * - this.popMsgFromQueue() + * functionality: + * - return resetOffset if exists + * - get offset if exists + * - init offset if not exists + * - get offset from popBufferMergeService + * + * @param topic topic + * @param group group + * @param queueId queueId + * @param initMode initMode ConsumeInitMode.MAX for pop mode + * @param init flag of whether commit offset the first time pop message + * @param lockKey lockKey + * @param checkResetOffset flag of whether resetPopOffset + * @return offset + */ private long getPopOffset(String topic, String group, int queueId, int initMode, boolean init, String lockKey, boolean checkResetOffset) throws ConsumeQueueException { long offset = this.brokerController.getConsumerOffsetManager().queryOffset(group, topic, queueId); if (offset < 0) { + //the first time consume, init offset by initMode offset = this.getInitOffset(topic, group, queueId, initMode, init); } + // before lock checkResetOffset is false + // after lock checkResetOffset is true + // This is an admin related feature if (checkResetOffset) { Long resetOffset = resetPopOffset(topic, group, queueId); if (resetOffset != null) { @@ -907,6 +1074,14 @@ private long getPopOffset(String topic, String group, int queueId, int initMode, } } + /** + * get offset from consume queue + * If consume from min offset: + * - return min offset. + * If consume from max offset: + * - get max offset + * - commit max offset if init is true. 
+ */ public long getInitOffset(String topic, String group, int queueId, int initMode, boolean init) throws ConsumeQueueException { long offset; diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopReviveService.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopReviveService.java index 07f16e98965..a6048c39ed7 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/PopReviveService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/PopReviveService.java @@ -64,6 +64,25 @@ import static org.apache.rocketmq.broker.metrics.BrokerMetricsConstant.LABEL_IS_SYSTEM; import static org.apache.rocketmq.broker.metrics.BrokerMetricsConstant.LABEL_TOPIC; +/** + * Per-queue service that reads the revive topic, matches checkpoints with AckMsgs, and + * revives timed-out messages by re-publishing them to the retry topic. + * + *

The only public method that carries business logic is {@code run}.

+ * + *

Each revive queue has its own dedicated {@code PopReviveService} instance. + * The service periodically: + *

    + *
  1. Scans the revive topic ({@link #consumeReviveMessage}) to collect CK + * (checkpoint) and Ack messages, merging Acks into CK's bitMap
  2. + *
  3. Processes expired checkpoints ({@link #mergeAndRevive}) by re-publishing any + * un-acked sub-messages back to the retry topic via {@link #reviveRetry}
  4. + *
+ * + *

This is the file-based revive path (CK + Ack messages are stored in + * the system revive topic). It is complemented by the KVStore-based path in + * {@code PopConsumerService} which handles the {@code PopConsumerKVStore} flow. + */ public class PopReviveService extends ServiceThread { private static final Logger POP_LOGGER = LoggerFactory.getLogger(LoggerName.ROCKETMQ_POP_LOGGER_NAME); private final int[] ckRewriteIntervalsInSeconds = new int[] { 10, 20, 30, 60, 120, 180, 240, 300, 360, 420, 480, 540, 600, 1200, 1800, 3600, 7200 }; @@ -74,6 +93,25 @@ public class PopReviveService extends ServiceThread { private long currentReviveMessageTimestamp = -1; private volatile boolean shouldRunPopRevive = false; + /** + * Tracks checkpoints that are currently being revived. + * + *

Key — the checkpoint being processed. + * Value — a pair of (startTime, completed), where: + *

    + *
  • {@code startTime} is the timestamp when revival began
  • + *
  • {@code completed} is {@code true} once all sub-messages have been + * processed (success or failure)
  • + *
+ * + *

The map is sorted by {@link PopCheckPoint#compareTo} (by startOffset). + * This ordering is used to drain completed entries from the head, ensuring + * the revive topic offset is committed strictly in sequence. + * + *

Concurrency is limited to at most 3 entries at a time (see + * {@link #mergeAndRevive}). If an entry stays incomplete for over 30 + * seconds, it is considered hung and is skipped via {@link #rePutCK}. + */ private final NavigableMap> inflightReviveRequestMap = Collections.synchronizedNavigableMap(new TreeMap<>()); private long reviveOffset; @@ -104,7 +142,23 @@ public boolean isShouldRunPopRevive() { return shouldRunPopRevive; } + /** + * Re-publish a timed-out message to the retry topic. + * + *

Constructs a new {@link MessageExtBrokerInner} from the original + * message, increments the reconsume count (unless suspended), sets the + * first-pop time and origin group properties, and writes it to the + * appropriate retry topic (V1 or V2 depending on configuration). + * + *

If the retry topic does not exist, it is created automatically + * via {@link #addRetryTopicIfNotExist}. + * + * @param popCheckPoint the checkpoint that triggered the revive + * @param messageExt the original message to re-publish + * @return {@code true} if the message was written successfully + */ private boolean reviveRetry(PopCheckPoint popCheckPoint, MessageExt messageExt) { + // convert checkpoint to inner message MessageExtBrokerInner msgInner = new MessageExtBrokerInner(); if (!popCheckPoint.getTopic().startsWith(MixAll.RETRY_GROUP_TOPIC_PREFIX)) { msgInner.setTopic(KeyBuilder.buildPopRetryTopic(popCheckPoint.getTopic(), popCheckPoint.getCId(), brokerController.getBrokerConfig().isEnableRetryTopicV2())); @@ -133,9 +187,15 @@ private boolean reviveRetry(PopCheckPoint popCheckPoint, MessageExt messageExt) } msgInner.getProperties().put(MessageConst.PROPERTY_ORIGIN_GROUP, popCheckPoint.getCId()); msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); + + // set topic and queueId addRetryTopicIfNotExist(msgInner.getTopic(), popCheckPoint.getCId()); msgInner.setQueueId(getRetryQueueId(msgInner.getTopic(), messageExt)); + + // store message PutMessageResult putMessageResult = brokerController.getEscapeBridge().putMessageToSpecificQueue(msgInner); + + // logging and metric brokerController.getBrokerMetricsManager().getPopMetricsManager().incPopReviveRetryMessageCount(popCheckPoint, putMessageResult.getPutMessageStatus()); if (brokerController.getBrokerConfig().isEnablePopLog()) { POP_LOGGER.info("reviveQueueId={},retry msg, ck={}, msg queueId {}, offset {}, reviveDelay={}, result is {} ", @@ -205,6 +265,17 @@ private int getRetryQueueId(String retryTopic, MessageExt messageExt) { return oriQueueId; } + /** + * Pull a batch of messages from the revive topic at the given offset. + * + *

If the offset becomes illegal (e.g. the revive topic was truncated), + * the revive offset is corrected to {@code nextBeginOffset - 1} so that + * the next scan starts from a valid position. + * + * @param offset the queue offset to start reading from + * @param queueId the revive queue id + * @return a list of decoded messages, or {@code null} if at the tail + */ protected List getReviveMessage(long offset, int queueId) { PullResult pullResult = getMessage(PopAckConstants.REVIVE_GROUP, reviveTopic, queueId, offset, 32, true); if (pullResult == null) { @@ -333,7 +404,39 @@ private List decodeMsgList(GetMessageResult getMessageResult, boolea return foundList; } + /** + * Pull Message from revive topic then transfer to checkpoint and ack messages. + * + *

This method reads messages from the revive topic starting from the + * current offset. Each message is classified by its tag: + *

    + *
  • {@link PopAckConstants#CK_TAG} — a checkpoint, deserialized from + * JSON and stored in the map by its merge key
  • + *
  • {@link PopAckConstants#ACK_TAG} or + * {@link PopAckConstants#BATCH_ACK_TAG} — an ack, matched to its + * corresponding checkpoint via the merge key. The ack offset is translated + * to a sub-message index ({@link PopCheckPoint#indexOfAck}) and + * the checkpoint's bitMap is updated via {@link DataConverter#setBit}
  • + *
+ * + *

When {@code enableSkipLongAwaitingAck} is enabled, AckMsgs that arrive after + * their checkpoint has already been processed are handled by creating a mock CK + * via {@link #mockCkForAck} so that the revive offset can still be + * committed correctly. + * + *

The scan stops when any of the following holds: + *

    + *
  • No more messages in the revive topic (tail reached)
  • + *
  • Scan time exceeds {@code reviveScanTime}
  • + *
  • The elapsed time since the first CK's revive time exceeds + * {@code ackTimeInterval + 1s}
  • + *
+ * + * @param consumeReviveObj the mutable container that receives the collected + * CKs and the computed {@code endTime} + */ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { + // init context parameters HashMap map = consumeReviveObj.map; HashMap mockPointMap = new HashMap<>(); long startScanTime = System.currentTimeMillis(); @@ -346,11 +449,14 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { int noMsgCount = 0; long firstRt = 0; // offset self amend + while (true) { if (!shouldRunPopRevive) { POP_LOGGER.info("slave skip scan, revive topic={}, reviveQueueId={}", reviveTopic, queueId); break; } + + // pull revive messages List messageExts = getReviveMessage(offset, queueId); if (messageExts == null || messageExts.isEmpty()) { long old = endTime; @@ -379,10 +485,13 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { } else { noMsgCount = 0; } + if (System.currentTimeMillis() - startScanTime > brokerController.getBrokerConfig().getReviveScanTime()) { POP_LOGGER.info("reviveQueueId={}, scan timeout ", queueId); break; } + + // convert message to PopCheckPoint and AckMsg for (MessageExt messageExt : messageExts) { if (PopAckConstants.CK_TAG.equals(messageExt.getTags())) { String raw = new String(messageExt.getBody(), DataConverter.CHARSET_UTF8); @@ -411,6 +520,7 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { String mergeKey = ackMsg.getTopic() + ackMsg.getConsumerGroup() + ackMsg.getQueueId() + ackMsg.getStartOffset() + ackMsg.getPopTime() + brokerName; PopCheckPoint point = map.get(mergeKey); if (point == null) { + // default value of enableSkipLongAwaitingAck is false if (!brokerController.getBrokerConfig().isEnableSkipLongAwaitingAck()) { continue; } @@ -418,6 +528,7 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { firstRt = mockPointMap.get(mergeKey).getReviveTime(); } } else { + // merge ackMsg into checkpoint int indexOfAck = 
point.indexOfAck(ackMsg.getAckOffset()); if (indexOfAck > -1) { point.setBitMap(DataConverter.setBit(point.getBitMap(), indexOfAck, true)); @@ -438,6 +549,7 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { String mergeKey = bAckMsg.getTopic() + bAckMsg.getConsumerGroup() + bAckMsg.getQueueId() + bAckMsg.getStartOffset() + bAckMsg.getPopTime() + brokerName; PopCheckPoint point = map.get(mergeKey); if (point == null) { + // default value of enableSkipLongAwaitingAck is false if (!brokerController.getBrokerConfig().isEnableSkipLongAwaitingAck()) { continue; } @@ -445,6 +557,7 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { firstRt = mockPointMap.get(mergeKey).getReviveTime(); } } else { + // merge ackMsgs into checkpoint List ackOffsetList = bAckMsg.getAckOffsetList(); for (Long ackOffset : ackOffsetList) { int indexOfAck = point.indexOfAck(ackOffset); @@ -467,6 +580,20 @@ protected void consumeReviveMessage(ConsumeReviveObj consumeReviveObj) { consumeReviveObj.endTime = endTime; } + /** + * Create a mock CK for an ack whose original CK has already been processed. + * + *

When an ack arrives long after its CK has been consumed (e.g. network + * delay), the CK is no longer in the scan map. If {@code enableSkipLongAwaitingAck} + * is enabled, this method creates a synthetic CK so that the revive offset + * can still be advanced correctly in {@link #mergeAndRevive}. + * + * @param messageExt the revive topic message that carried the ack + * @param ackMsg the decoded ack + * @param mergeKey the merge key for the CK lookup + * @param mockPointMap map to collect the mock CKs + * @return {@code true} if a mock CK was created + */ private boolean mockCkForAck(MessageExt messageExt, AckMsg ackMsg, String mergeKey, HashMap mockPointMap) { long ackWaitTime = System.currentTimeMillis() - messageExt.getDeliverTimeMs(); long reviveAckWaitMs = brokerController.getBrokerConfig().getReviveAckWaitMs(); @@ -482,6 +609,17 @@ private boolean mockCkForAck(MessageExt messageExt, AckMsg ackMsg, String mergeK return false; } + /** + * Build a synthetic checkpoint from an ack message. + * + *

The mock CK has {@code num = 0} and empty bitMap, meaning no actual + * messages to revive. Its only purpose is to carry the {@code reviveOffset} + * so that the revive consumer offset can be committed past this ack. + * + * @param ackMsg the ack message + * @param reviveOffset the queue offset of the ack message in the revive topic + * @return a mock checkpoint with no sub-messages + */ private PopCheckPoint createMockCkForAck(AckMsg ackMsg, long reviveOffset) { PopCheckPoint point = new PopCheckPoint(); point.setStartOffset(ackMsg.getStartOffset()); @@ -496,7 +634,26 @@ private PopCheckPoint createMockCkForAck(AckMsg ackMsg, long reviveOffset) { return point; } + /** + * Process collected checkpoints and revive all un-acked sub-messages. + * + *

Checkpoints are sorted by revive offset. For each one: + *

    + *
  • Skip if the revive time has not yet elapsed (within + * {@code ackTimeInterval + 1s} of {@code endTime})
  • + *
  • Skip if the normal topic or consumer group no longer exists
  • + *
  • Wait while more than 3 revives are already in-flight
  • + *
  • Call {@link #reviveMsgFromCk} to re-publish un-acked messages
  • + *
+ * + *

After processing, the revive topic offset is advanced past all + * processed checkpoints. + * + * @param consumeReviveObj the container with collected CKs and scan state + * @throws Throwable if any revive operation fails + */ protected void mergeAndRevive(ConsumeReviveObj consumeReviveObj) throws Throwable { + // sort checkpoints and init newOffset ArrayList sortList = consumeReviveObj.genSortList(); POP_LOGGER.info("reviveQueueId={}, ck listSize={}", queueId, sortList.size()); if (sortList.size() != 0) { @@ -504,6 +661,7 @@ protected void mergeAndRevive(ConsumeReviveObj consumeReviveObj) throws Throwabl sortList.get(0).getReviveOffset(), sortList.get(sortList.size() - 1).getStartOffset(), sortList.get(sortList.size() - 1).getReviveOffset()); } long newOffset = consumeReviveObj.oldOffset; + for (PopCheckPoint popCheckPoint : sortList) { if (!shouldRunPopRevive) { POP_LOGGER.info("slave skip ck process, revive topic={}, reviveQueueId={}", reviveTopic, queueId); @@ -526,11 +684,14 @@ protected void mergeAndRevive(ConsumeReviveObj consumeReviveObj) throws Throwabl continue; } + // Concurrency control for revive: skip first long-running revive task. 
while (inflightReviveRequestMap.size() > 3) { waitForRunning(100); Pair pair = inflightReviveRequestMap.firstEntry().getValue(); + // if first revive task is timeout, reput it to revive topic, then skip if (!pair.getObject2() && System.currentTimeMillis() - pair.getObject1() > 1000 * 30) { PopCheckPoint oldCK = inflightReviveRequestMap.firstKey(); + // reput checkpoint to revive topic rePutCK(oldCK, pair); inflightReviveRequestMap.remove(oldCK); POP_LOGGER.warn("stay too long, remove from reviveRequestMap, {}, {}, {}, {}", popCheckPoint.getTopic(), @@ -538,10 +699,12 @@ protected void mergeAndRevive(ConsumeReviveObj consumeReviveObj) throws Throwabl } } + // revive message reviveMsgFromCk(popCheckPoint); - newOffset = popCheckPoint.getReviveOffset(); } + + // commit offset if (newOffset > consumeReviveObj.oldOffset) { if (!shouldRunPopRevive) { POP_LOGGER.info("slave skip commit, revive topic={}, reviveQueueId={}", reviveTopic, queueId); @@ -553,22 +716,46 @@ protected void mergeAndRevive(ConsumeReviveObj consumeReviveObj) throws Throwabl consumeReviveObj.newOffset = newOffset; } + /** + * Revive all un-acked sub-messages in a checkpoint: + * - reput message to revive topic + * - put message to retry topic + * + *

For each sub-message whose bit is not set in the bitMap, the original + * message is fetched via {@link #getBizMessage} and re-published to the + * retry topic via {@link #reviveRetry}. All revive attempts run + * concurrently via {@link CompletableFuture#allOf}. + * + *

After all attempts complete: + *

    + *
  • Failed offsets are re-queued via {@link #rePutCK}
  • + *
  • The {@link #inflightReviveRequestMap} is updated and completed + * entries are removed in order, advancing the revive offset
  • + *
+ * + * @param popCheckPoint the checkpoint whose un-acked messages should be revived + */ private void reviveMsgFromCk(PopCheckPoint popCheckPoint) { + // env check and init if (!shouldRunPopRevive) { POP_LOGGER.info("slave skip retry, revive topic={}, reviveQueueId={}", reviveTopic, queueId); return; } inflightReviveRequestMap.put(popCheckPoint, new Pair<>(System.currentTimeMillis(), false)); List>> futureList = new ArrayList<>(popCheckPoint.getNum()); + + // put message to retry topic if checkpoint was not acked for (int j = 0; j < popCheckPoint.getNum(); j++) { + // if checkpoint was acked, skip if (DataConverter.getBit(popCheckPoint.getBitMap(), j)) { continue; } - // retry msg + // get message by checkpoint, then put message to retry topic long msgOffset = popCheckPoint.ackOffsetByIndex((byte) j); CompletableFuture> future = getBizMessage(popCheckPoint, msgOffset) .thenApply(rst -> { + // validate message MessageExt message = rst.getLeft(); if (message == null) { POP_LOGGER.info("reviveQueueId={}, can not get biz msg, topic:{}, qid:{}, offset:{}, brokerName:{}, info:{}, retry:{}, then continue", @@ -580,8 +767,11 @@ private void reviveMsgFromCk(PopCheckPoint popCheckPoint) { }); futureList.add(future); } + + // reput checkpoint to revive topic if retry failed CompletableFuture.allOf(futureList.toArray(new CompletableFuture[0])) .whenComplete((v, e) -> { + // reput checkpoint for (CompletableFuture> future : futureList) { Pair pair = future.getNow(new Pair<>(0L, false)); if (!pair.getObject2()) { @@ -589,9 +779,12 @@ private void reviveMsgFromCk(PopCheckPoint popCheckPoint) { } } + // update ack status of inflight checkpoint if (inflightReviveRequestMap.containsKey(popCheckPoint)) { inflightReviveRequestMap.get(popCheckPoint).setObject2(true); } + + // commit offset and remove inflight checkpoint for (Map.Entry> entry : inflightReviveRequestMap.entrySet()) { PopCheckPoint oldCK = entry.getKey(); Pair pair = entry.getValue(); @@ -605,6 +798,24 @@ private void 
reviveMsgFromCk(PopCheckPoint popCheckPoint) { }); } + /** + * Re-write a checkpoint to the revive topic after a failed revive attempt. + * + *

When a sub-message cannot be revived (e.g. the original message is + * temporarily unavailable), the CK is re-published with: + *

    + *
  • A single sub-message targeting the failed offset
  • + *
  • An increased {@code rePutTimes} and an extended invisible time + * based on the backoff interval
  • + *
  • A cleared bitMap, so the next revive cycle will retry it
  • + *
+ * + *

If {@code rePutTimes} reaches or exceeds the backoff table length and + * {@code skipWhenCKRePutReachMaxTimes} is set, the CK is dropped. + * + * @param oldCK the original checkpoint that failed to revive + * @param pair the failed offset and result (object1 = offset, object2 = result) + */ private void rePutCK(PopCheckPoint oldCK, Pair pair) { int rePutTimes = oldCK.parseRePutTimes(); if (rePutTimes >= ckRewriteIntervalsInSeconds.length && brokerController.getBrokerConfig().isSkipWhenCKRePutReachMaxTimes()) { @@ -654,11 +865,27 @@ public long getReviveBehindMessages() throws ConsumeQueueException { return Math.max(0, diff); } + /** + * Main loop: periodically consume revive messages and revive timed-out CKs. + * + *

Each iteration: + *

    + *
  1. Waits for {@code reviveInterval} (configurable)
  2. + *
  3. Calls {@link #consumeReviveMessage} to scan the revive topic and + * merge checkpoints with their corresponding AckMsg
  4. + *
  5. Calls {@link #mergeAndRevive} to re-publish all un-acked + * sub-messages whose revive time has elapsed
  6. + *
  7. If no checkpoints were processed, increases a {@code slow} counter and + * sleeps longer — the idle interval ramps up to + * {@code reviveMaxSlow * reviveInterval}
  8. + *
+ */ @Override public void run() { int slow = 1; while (!this.isStopped()) { try { + // env check if (System.currentTimeMillis() < brokerController.getShouldStartTime()) { POP_LOGGER.info("PopReviveService Ready to run after {}", brokerController.getShouldStartTime()); this.waitForRunning(1000); @@ -676,6 +903,8 @@ public void run() { } POP_LOGGER.info("start revive topic={}, reviveQueueId={}", reviveTopic, queueId); + + // consume revive message ConsumeReviveObj consumeReviveObj = new ConsumeReviveObj(); consumeReviveMessage(consumeReviveObj); @@ -684,8 +913,10 @@ public void run() { continue; } + // merge checkpoint and ackMsg then revive mergeAndRevive(consumeReviveObj); + // wait and logging ArrayList sortList = consumeReviveObj.sortList; long delay = 0; if (sortList != null && !sortList.isEmpty()) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/processor/SendMessageProcessor.java b/broker/src/main/java/org/apache/rocketmq/broker/processor/SendMessageProcessor.java index 5f5671fb7a0..d3cbee08225 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/processor/SendMessageProcessor.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/processor/SendMessageProcessor.java @@ -93,6 +93,7 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, case RequestCode.CONSUMER_SEND_MSG_BACK: return this.consumerSendMsgBack(ctx, request); default: + // build send message context SendMessageRequestHeader requestHeader = parseRequestHeader(request); if (requestHeader == null) { return null; @@ -103,6 +104,8 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, return rewriteResult; } sendMessageContext = buildMsgContext(ctx, requestHeader, request); + + // execute send message hook before try { this.executeSendMessageHookBefore(sendMessageContext); } catch (AbortProcessException e) { @@ -114,7 +117,7 @@ public RemotingCommand processRequest(ChannelHandlerContext ctx, RemotingCommand response; 
clearReservedProperties(requestHeader); - if (requestHeader.isBatch()) { + if (requestHeader.isBatch()) { // no batch message after 5.0 response = this.sendBatchMessage(ctx, request, sendMessageContext, requestHeader, mappingContext, (ctx1, response1) -> executeSendMessageHookAfter(response1, ctx1)); } else { @@ -321,9 +324,12 @@ public RemotingCommand sendMessage(final ChannelHandlerContext ctx, // Map oriProps = MessageDecoder.string2messageProperties(requestHeader.getProperties()); String traFlag = oriProps.get(MessageConst.PROPERTY_TRANSACTION_PREPARED); + // sendTransactionPrepareMessage is true, when traFlag is true, after version 4.6.1 boolean sendTransactionPrepareMessage; if (Boolean.parseBoolean(traFlag) + // For client under version 4.6.1, exclude retry message with delay level. && !(msgInner.getReconsumeTimes() > 0 && msgInner.getDelayTimeLevel() > 0)) { //For client under version 4.6.1 + // default value of rejectTransactionMessage is false if (this.brokerController.getBrokerConfig().isRejectTransactionMessage()) { response.setCode(ResponseCode.NO_PERMISSION); response.setRemark( @@ -338,6 +344,7 @@ public RemotingCommand sendMessage(final ChannelHandlerContext ctx, long beginTimeMillis = this.brokerController.getMessageStore().now(); + // default is true if (brokerController.getBrokerConfig().isAsyncSendEnable()) { CompletableFuture asyncPutMessageFuture; if (sendTransactionPrepareMessage) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/AbstractTransactionalMessageCheckListener.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/AbstractTransactionalMessageCheckListener.java index d1b77355b03..42b57ff4626 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/AbstractTransactionalMessageCheckListener.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/AbstractTransactionalMessageCheckListener.java @@ -48,7 +48,19 @@ public 
AbstractTransactionalMessageCheckListener(BrokerController brokerControll this.brokerController = brokerController; } + /** + * Send a transaction status check request to the producer that originated + * the half message. + * + *

Before sending, the message headers are restored to the real + * business topic and queue so the producer can identify the original + * message. The network request carries commitLogOffset, msgId, + * transactionId, and queue offset for the producer to look up the local + * transaction state. If the producer's channel is no longer connected, + * the check is skipped with a warning. + */ public void sendCheckMessage(MessageExt msgExt) throws Exception { + // format request header and message CheckTransactionStateRequestHeader checkTransactionStateRequestHeader = new CheckTransactionStateRequestHeader(); checkTransactionStateRequestHeader.setTopic(msgExt.getTopic()); checkTransactionStateRequestHeader.setCommitLogOffset(msgExt.getCommitLogOffset()); @@ -60,17 +72,33 @@ public void sendCheckMessage(MessageExt msgExt) throws Exception { msgExt.setTopic(msgExt.getUserProperty(MessageConst.PROPERTY_REAL_TOPIC)); msgExt.setQueueId(Integer.parseInt(msgExt.getUserProperty(MessageConst.PROPERTY_REAL_QUEUE_ID))); msgExt.setStoreSize(0); + + // find channel, channel can send message to client String groupId = msgExt.getProperty(MessageConst.PROPERTY_PRODUCER_GROUP); Channel channel = brokerController.getProducerManager().getAvailableChannel(groupId); + if (channel != null) { + // invoke channel.writeAndFlush() -> GrpcClientChannel.processCheckTransaction() brokerController.getBroker2Client().checkProducerTransactionState(groupId, channel, checkTransactionStateRequestHeader, msgExt); } else { LOGGER.warn("Check transaction failed, channel is null. groupId={}", groupId); } } + /** + * Asynchronously contact the producer to check the status of a half + * message (commit, rollback, or unknown). + * + *

This is invoked by the transaction check loop when a half message + * has aged past its immunity window without an OP record. The actual + * network request is dispatched to the dedicated + * {@code Transaction-msg-check-thread} pool so the caller is not blocked. + * If the pool is full, the {@code CallerRunsPolicy} will execute the task + * on the caller thread, applying back-pressure to the check loop. + */ public void resolveHalfMsg(final MessageExt msgExt) { if (executorService != null) { + // executorService thread pool(2~5 threads) executorService.execute(new Runnable() { @Override public void run() { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/TransactionalMessageCheckService.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/TransactionalMessageCheckService.java index 52209c3fbdb..33c75166256 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/TransactionalMessageCheckService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/TransactionalMessageCheckService.java @@ -22,6 +22,16 @@ import org.apache.rocketmq.logging.org.slf4j.Logger; import org.apache.rocketmq.logging.org.slf4j.LoggerFactory; +/** + * Background service that periodically triggers transaction status checks. + * only one public method: run, and calls TransactionalMessageService.check() + * + *

Runs at a configurable interval ({@code transactionCheckInterval}). Each + * iteration calls + * {@link TransactionalMessageService#check(long, int, AbstractTransactionalMessageCheckListener)} + * which scans the half-message topic for unresolved transactions and either + * initiates a broker-side check-back or discards expired messages. + */ public class TransactionalMessageCheckService extends ServiceThread { private static final Logger log = LoggerFactory.getLogger(LoggerName.TRANSACTION_LOGGER_NAME); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/DefaultTransactionalMessageCheckListener.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/DefaultTransactionalMessageCheckListener.java index 6770561823f..aac3addcfbe 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/DefaultTransactionalMessageCheckListener.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/DefaultTransactionalMessageCheckListener.java @@ -37,6 +37,15 @@ public DefaultTransactionalMessageCheckListener() { super(); } + /** + * Move a half message to the dead-letter system topic + * {@code TRANS_CHECK_MAXTIME_TOPIC} when it has been checked too many + * times without a definitive commit or rollback. + * + *

Once moved, the message is no longer tracked by the transaction check + * loop and will never be delivered to the consumer. This prevents endless + * rechecking of messages whose producer is permanently unable to respond. + */ @Override public void resolveDiscardMsg(MessageExt msgExt) { log.error("MsgExt:{} has been checked too many times, so discard it by moving it to system topic TRANS_CHECK_MAXTIME_TOPIC", msgExt); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/MessageQueueOpContext.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/MessageQueueOpContext.java index e8e5f13de6b..1057749e5a3 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/MessageQueueOpContext.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/MessageQueueOpContext.java @@ -19,11 +19,25 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.atomic.AtomicInteger; +/** + * Write buffer for transaction OP (operation) queue offsets. + * + *

In the transaction message flow, commit/rollback operations produce OP + * records that are written to the {@code RMQ_SYS_TRANS_OP_HALF_TOPIC}. + * Instead of writing each OP individually, offsets are buffered here and + * flushed in batches to reduce I/O. + * + *

The {@link #contextQueue} holds batched offset strings, while + * {@link #totalSize} tracks the accumulated count and + * {@link #lastWriteTimestamp} controls flush timing. + */ public class MessageQueueOpContext { private AtomicInteger totalSize = new AtomicInteger(0); private volatile long lastWriteTimestamp; + // offset1, offset2, offsetN, ... private LinkedBlockingQueue contextQueue; + // queueLength is 20000, hard coded. public MessageQueueOpContext(long timestamp, int queueLength) { this.lastWriteTimestamp = timestamp; contextQueue = new LinkedBlockingQueue(queueLength); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageBridge.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageBridge.java index 47e453946d7..4e69f57e35e 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageBridge.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageBridge.java @@ -213,24 +213,54 @@ public PutMessageResult putHalfMessage(MessageExtBrokerInner messageInner) { } public CompletableFuture asyncPutHalfMessage(MessageExtBrokerInner messageInner) { + // store is DefaultMessageStore return store.asyncPutMessage(parseHalfMessageInner(messageInner)); } + /** + * Transform a transactional message into a half message and redirect it to + * the half-message topic. + * + *

The method: + *

    + *
  1. Copies the client message ID as the transaction ID for later + * checkpoint lookup
  2. + *
  3. Preserves the original topic and queue as properties so that + * they can be restored when the transaction commits
  4. + *
  5. Clears the transaction sys-flag to prevent re-interception
  6. + *
  7. Redirects the message to {@code RMQ_SYS_TRANS_HALF_TOPIC} + * or {@code RMQ_SYS_ROCKSDB_TRANS_HALF_TOPIC} depending on config
  8. + *
+ * + * @param msgInner the original transactional message + * @return the transformed half message + */ private MessageExtBrokerInner parseHalfMessageInner(MessageExtBrokerInner msgInner) { + // set transactionId String uniqId = msgInner.getUserProperty(MessageConst.PROPERTY_UNIQ_CLIENT_MESSAGE_ID_KEYIDX); if (uniqId != null && !uniqId.isEmpty()) { MessageAccessor.putProperty(msgInner, TransactionalMessageUtil.TRANSACTION_ID, uniqId); } + + // store real topic and queueId to properties MessageAccessor.putProperty(msgInner, MessageConst.PROPERTY_REAL_TOPIC, msgInner.getTopic()); MessageAccessor.putProperty(msgInner, MessageConst.PROPERTY_REAL_QUEUE_ID, String.valueOf(msgInner.getQueueId())); + + // Clears the transaction sys-flag to prevent re-interception msgInner.setSysFlag( MessageSysFlag.resetTransactionValue(msgInner.getSysFlag(), MessageSysFlag.TRANSACTION_NOT_TYPE)); + + // set transactional topic + // 1. TopicValidator.RMQ_SYS_ROCKSDB_TRANS_HALF_TOPIC if rocksdb enable + // 2. TopicValidator.RMQ_SYS_TRANS_HALF_TOPIC if (null != store.getMessageStoreConfig() && store.getMessageStoreConfig().isTransRocksDBEnable() && !store.getMessageStoreConfig().isTransWriteOriginTransHalfEnable()) { msgInner.setTopic(TransactionalMessageUtil.buildHalfTopicForRocksDB()); } else { msgInner.setTopic(TransactionalMessageUtil.buildHalfTopic()); } + + // set queueId and propertiesString msgInner.setQueueId(0); msgInner.setPropertiesString(MessageDecoder.messageProperties2String(msgInner.getProperties())); return msgInner; @@ -260,6 +290,19 @@ public boolean putMessage(MessageExtBrokerInner messageInner) { } } + /** + * Renew a half message and set property MessageConst.PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET if not exists + * + *

This is used when re-putting a half message back to the HALF topic + * during the transaction check-back process. The + * {@code PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET} property records the + * original queue offset so that later checks can determine whether the + * producer has already committed or rolled back the message within the + * immunity window. + * + * @param msgExt the original half message + * @return a new half message with the prepared queue offset preserved + */ public MessageExtBrokerInner renewImmunityHalfMessageInner(MessageExt msgExt) { MessageExtBrokerInner msgInner = renewHalfMessageInner(msgExt); String queueOffsetFromPrepare = msgExt.getUserProperty(MessageConst.PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImpl.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImpl.java index 2f05bee0040..9e3d46653f3 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImpl.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalMessageServiceImpl.java @@ -67,6 +67,10 @@ public class TransactionalMessageServiceImpl implements TransactionalMessageServ private static final int SLEEP_WHILE_NO_OP = 1000; + /** + * deleted offset queue map + * only one key: 0 + */ private final ConcurrentHashMap deleteContext = new ConcurrentHashMap<>(); private ServiceThread transactionalOpBatchService; @@ -105,6 +109,20 @@ public PutMessageResult prepareMessage(MessageExtBrokerInner messageInner) { return transactionalMessageBridge.putHalfMessage(messageInner); } + /** + * Check whether the prepared message has exceeded the maximum number of + * transaction checks and should be discarded. + * isOverMaxCheckTimes may be a better method name + * + *

Each time the message is checked, the {@code TRANSACTION_CHECK_TIMES} + * property is incremented. When it reaches {@code transactionCheckMax}, + * the message is considered expired and will be discarded via + * {@link AbstractTransactionalMessageCheckListener#resolveDiscardMsg}. + * + * @param msgExt the prepared message being checked + * @param transactionCheckMax maximum allowed check attempts + * @return {@code true} if the message should be discarded + */ private boolean needDiscard(MessageExt msgExt, int transactionCheckMax) { String checkTimes = msgExt.getProperty(MessageConst.PROPERTY_TRANSACTION_CHECK_TIMES); int checkTime = 1; @@ -120,6 +138,19 @@ private boolean needDiscard(MessageExt msgExt, int transactionCheckMax) { return false; } + /** + * Check whether the prepared message should be skipped because its birth + * time exceeds the commit log's file reserved time. + * isExpired may be a better method name + * + *

If the message has been in the half topic longer than + * {@code fileReservedTime} hours, the corresponding commit log data may + * have already been deleted. The message is skipped rather than checked + * to avoid unnecessary IO and potential errors. + * + * @param msgExt the prepared message being checked + * @return {@code true} if the message should be skipped + */ private boolean needSkip(MessageExt msgExt) { long valueOfCurrentMinusBorn = System.currentTimeMillis() - msgExt.getBornTimestamp(); if (valueOfCurrentMinusBorn @@ -158,10 +189,37 @@ private boolean putBackHalfMsgQueue(MessageExt msgExt, long offset) { } } + /** + * Scan the half-message topic and compare with the OP topic to find + * unresolved transactions. Called by an independent thread (TransactionalMessageCheckService). + * + *

For each queue in the HALF topic: + *

    + *
  1. Fetches the OP (operation) topic at the current offset and builds + * a {@code removeMap} — half offsets that have been committed or + * rolled back
  2. + *
  3. Iterates through the HALF topic: + *
      + *
    • If the offset is in {@code removeMap} → skip (already done)
    • + *
    • If the message is outside the immunity window → calls + * {@code listener.resolveHalfMsg} to trigger a broker-side + * check-back to the producer
    • + *
    • If the message exceeds check max or file-reserved time → + * {@code listener.resolveDiscardMsg}
    • + *
    + *
  4. + *
  5. Updates consume offsets for both HALF and OP topics
  6. + *
+ * + * @param transactionTimeout the transaction timeout in milliseconds + * @param transactionCheckMax maximum number of times to check a transaction + * @param listener callback for resolved or discarded messages + */ @Override public void check(long transactionTimeout, int transactionCheckMax, AbstractTransactionalMessageCheckListener listener) { try { + // fetch message queues of the half-message topic, one queue by default String topic = TopicValidator.RMQ_SYS_TRANS_HALF_TOPIC; Set msgQueues = transactionalMessageBridge.fetchMessageQueues(topic); if (msgQueues == null || msgQueues.size() == 0) { @@ -169,7 +227,10 @@ public void check(long transactionTimeout, int transactionCheckMax, return; } log.debug("Check topic={}, queues={}", topic, msgQueues); + + // loop through each prepare queue, one queue by default for (MessageQueue messageQueue : msgQueues) { + // init context: opQueue, offsets, etc long startTime = System.currentTimeMillis(); MessageQueue opQueue = getOpQueue(messageQueue); long halfOffset = transactionalMessageBridge.fetchConsumeOffset(messageQueue); @@ -181,15 +242,29 @@ public void check(long transactionTimeout, int transactionCheckMax, continue; } + // opOffset list for which: + // - message body is null or empty + // - all corresponding prepareOffsets < miniOffset + // - all corresponding prepareOffsets have been committed/rolled back List doneOpOffset = new ArrayList<>(); + // the relation between: + // - prepareOffset : opOffset = N:1 + // - prepareOffset : prepareMessage = 1:1 + // Map HashMap removeMap = new HashMap<>(); + // Map> + // This match the storage format of the OP topic queue HashMap> opMsgMap = new HashMap>(); + + // load op message to removeMap PullResult pullResult = fillOpRemoveMap(removeMap, opQueue, opOffset, halfOffset, opMsgMap, doneOpOffset); if (null == pullResult) { log.error("The queue={} check msgOffset={} with opOffset={} failed, pullResult is null", messageQueue, halfOffset, opOffset); continue; } + + // init 
merge prepare queue and op queue context // single thread int getMessageNullCount = 1; long newOffset = halfOffset; @@ -204,6 +279,8 @@ public void check(long transactionTimeout, int transactionCheckMax, break; } Long removedOpOffset; + + // remove committed/rolled back message if ((removedOpOffset = removeMap.remove(i)) != null) { log.debug("Half offset {} has been committed/rolled back", i); opMsgMap.get(removedOpOffset).remove(i); @@ -211,7 +288,10 @@ public void check(long transactionTimeout, int transactionCheckMax, opMsgMap.remove(removedOpOffset); doneOpOffset.add(removedOpOffset); } - } else { + } + // merge prepare message and op message + else { + // get one prepare message GetResult getResult = getHalfMsg(messageQueue, i); MessageExt msgExt = getResult.getMsg(); if (msgExt == null) { @@ -231,6 +311,7 @@ public void check(long transactionTimeout, int transactionCheckMax, } } + // slave acting master mode if (this.transactionalMessageBridge.getBrokerController().getBrokerConfig().isEnableSlaveActingMaster() && this.transactionalMessageBridge.getBrokerController().getMinBrokerIdInGroup() == this.transactionalMessageBridge.getBrokerController().getBrokerIdentity().getBrokerId() @@ -260,24 +341,30 @@ public void check(long transactionTimeout, int transactionCheckMax, continue; } + // if isOverMaxCheckTimes or isExpired, discard the message via resolveDiscardMsg and move on to the next offset if (needDiscard(msgExt, transactionCheckMax) || needSkip(msgExt)) { listener.resolveDiscardMsg(msgExt); newOffset = i + 1; i++; continue; } + + // skip if the message is fresh, break if (msgExt.getStoreTimestamp() >= startTime) { log.debug("Fresh stored. 
the miss offset={}, check it later, store={}", i, new Date(msgExt.getStoreTimestamp())); break; } + // check ImmunityTime, skip if bornTime <= immunityTime, reput to prepare queue if needed long valueOfCurrentMinusBorn = System.currentTimeMillis() - msgExt.getBornTimestamp(); long checkImmunityTime = transactionTimeout; String checkImmunityTimeStr = msgExt.getUserProperty(MessageConst.PROPERTY_CHECK_IMMUNITY_TIME_IN_SECONDS); if (null != checkImmunityTimeStr) { + // convert checkImmunityTimeStr to long, if failed, use transactionTimeout checkImmunityTime = getImmunityTime(checkImmunityTimeStr, transactionTimeout); if (valueOfCurrentMinusBorn <= checkImmunityTime) { + // check the prepare message has been committed/rolled back if (checkPrepareQueueOffset(removeMap, doneOpOffset, msgExt, checkImmunityTimeStr)) { newOffset = i + 1; i++; @@ -291,13 +378,18 @@ public void check(long transactionTimeout, int transactionCheckMax, break; } } + + // calculate isNeedCheck List opMsg = pullResult == null ? null : pullResult.getMsgFoundList(); + // isNeedCheck is true if: + // - no opMessage && (now - bornTime) > checkImmunityTime + // - have opMessage && (lastOpMessage.bornTime - startTime) > transactionTimeout + // - bornTime is later than now (now - bornTime <= -1) boolean isNeedCheck = opMsg == null && valueOfCurrentMinusBorn > checkImmunityTime || opMsg != null && opMsg.get(opMsg.size() - 1).getBornTimestamp() - startTime > transactionTimeout || valueOfCurrentMinusBorn <= -1; if (isNeedCheck) { - if (!putBackHalfMsgQueue(msgExt, i)) { continue; } @@ -307,7 +399,7 @@ public void check(long transactionTimeout, int transactionCheckMax, msgExt.getUserProperty(MessageConst.PROPERTY_UNIQ_CLIENT_MESSAGE_ID_KEYIDX), msgExt.getQueueOffset(), msgExt.getCommitLogOffset()); listener.resolveHalfMsg(msgExt); - } else { + } else { // fetch more opMessages and sleep if needed nextOpOffset = pullResult != null ? 
pullResult.getNextBeginOffset() : nextOpOffset; pullResult = fillOpRemoveMap(removeMap, opQueue, nextOpOffset, halfOffset, opMsgMap, doneOpOffset); @@ -330,6 +422,8 @@ public void check(long transactionTimeout, int transactionCheckMax, newOffset = i + 1; i++; } + + // update new offsets and logging if (newOffset != halfOffset) { transactionalMessageBridge.updateConsumeOffset(messageQueue, newOffset); } @@ -337,6 +431,7 @@ public void check(long transactionTimeout, int transactionCheckMax, if (newOpOffset != opOffset) { transactionalMessageBridge.updateConsumeOffset(opQueue, newOpOffset); } + GetResult getResult = getHalfMsg(messageQueue, newOffset); pullResult = pullOpMsg(opQueue, newOpOffset, 1); long maxMsgOffset = getResult.getPullResult() == null ? newOffset : getResult.getPullResult().getMaxOffset(); @@ -350,9 +445,22 @@ public void check(long transactionTimeout, int transactionCheckMax, } catch (Throwable e) { log.error("Check error", e); } - } + /** + * convert checkImmunityTimeStr to long, return transactionTimeout if invalid. + * + *

The immunity time is the minimum duration the broker must wait + * before initiating a transaction status check-back to the producer. + * If the producer specifies a custom value via + * {@code PROPERTY_CHECK_IMMUNITY_TIME_IN_SECONDS}, it is used (converted + * from seconds to millis). Otherwise, the default + * {@code transactionTimeout} is returned. + * + * @param checkImmunityTimeStr the custom immunity time string, may be null + * @param transactionTimeout the default transaction timeout + * @return the immunity time in milliseconds + */ private long getImmunityTime(String checkImmunityTimeStr, long transactionTimeout) { long checkImmunityTime; @@ -378,6 +486,7 @@ private long getImmunityTime(String checkImmunityTimeStr, long transactionTimeou */ private PullResult fillOpRemoveMap(HashMap removeMap, MessageQueue opQueue, long pullOffsetOfOp, long miniOffset, Map> opMsgMap, List doneOpOffset) { + // pull op messages(32 by default) PullResult pullResult = pullOpMsg(opQueue, pullOffsetOfOp, OP_MSG_PULL_NUMS); if (null == pullResult) { return null; @@ -398,18 +507,24 @@ private PullResult fillOpRemoveMap(HashMap removeMap, MessageQueue o log.warn("The miss op offset={} in queue={} is empty, pullResult={}", pullOffsetOfOp, opQueue, pullResult); return pullResult; } + for (MessageExt opMessageExt : opMsg) { + // add opOffset to doneOpOffset if body is null if (opMessageExt.getBody() == null) { log.error("op message body is null. 
queueId={}, offset={}", opMessageExt.getQueueId(), opMessageExt.getQueueOffset()); doneOpOffset.add(opMessageExt.getQueueOffset()); continue; } + + // format op message body HashSet set = new HashSet(); String queueOffsetBody = new String(opMessageExt.getBody(), TransactionalMessageUtil.CHARSET); log.debug("Topic: {} tags: {}, OpOffset: {}, HalfOffset: {}", opMessageExt.getTopic(), opMessageExt.getTags(), opMessageExt.getQueueOffset(), queueOffsetBody); + + // valid opMessage has tag: REMOVE_TAG if (TransactionalMessageUtil.REMOVE_TAG.equals(opMessageExt.getTags())) { String[] offsetArray = queueOffsetBody.split(TransactionalMessageUtil.OFFSET_SEPARATOR); for (String offset : offsetArray) { @@ -418,6 +533,7 @@ private PullResult fillOpRemoveMap(HashMap removeMap, MessageQueue o continue; } + // put prepareOffset, opOffset to removeMap removeMap.put(offsetValue, opMessageExt.getQueueOffset()); set.add(offsetValue); } @@ -425,13 +541,16 @@ private PullResult fillOpRemoveMap(HashMap removeMap, MessageQueue o log.error("Found a illegal tag in opMessageExt= {} ", opMessageExt); } + // put opOffset Set to opMsgMap if (set.size() > 0) { opMsgMap.put(opMessageExt.getQueueOffset(), set); } else { + // if all prepareOffset is done, add opOffset to doneOpOffset doneOpOffset.add(opMessageExt.getQueueOffset()); } } + // logging log.debug("Remove map: {}", removeMap); log.debug("Done op list: {}", doneOpOffset); log.debug("opMsg map: {}", opMsgMap); @@ -439,25 +558,44 @@ private PullResult fillOpRemoveMap(HashMap removeMap, MessageQueue o } /** - * If return true, skip this msg + * Check whether the prepared queue offset of a half message has been + * committed or rolled back during the immunity window. + * + *

There are three cases: + *

    + *
  • No {@code PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET} — first time + * this message is checked. Re-put it back to the half topic with the + * offset attached so subsequent checks can skip it by offset.
  • + *
  • Offset found in {@code removeMap} — the producer has committed or + * rolled back via the OP queue. Remove from the pending map and mark as + * done.
  • + *
  • Offset not in {@code removeMap} — the producer has not responded + * yet. Re-queue for a future check.
  • + *
* * @param removeMap Op message map to determine whether a half message was responded by producer. * @param doneOpOffset Op Message which has been checked. * @param msgExt Half message - * @return Return true if put success, otherwise return false. + * @return true if the message can be skipped (completed or re-queued), + * false if the offset is illegal */ private boolean checkPrepareQueueOffset(HashMap removeMap, List doneOpOffset, MessageExt msgExt, String checkImmunityTimeStr) { String prepareQueueOffsetStr = msgExt.getUserProperty(MessageConst.PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET); if (null == prepareQueueOffsetStr) { + // This message has never been checked by Rpc transaction-checker. + // We need re-put this message back to the end of the Half_Topic. + // so that we can skip the current offset(message) to check for the following message. + // PROPERTY_TRANSACTION_PREPARED_QUEUE_OFFSET will be added to the message by putImmunityMsgBackToHalfQueue. return putImmunityMsgBackToHalfQueue(msgExt); } else { long prepareQueueOffset = getLong(prepareQueueOffsetStr); - if (-1 == prepareQueueOffset) { + if (-1 == prepareQueueOffset) { // illegal offset return false; } else { Long tmpOpOffset; if ((tmpOpOffset = removeMap.remove(prepareQueueOffset)) != null) { + // message has been committed/rollback, remove it and add it to doneOpOffset doneOpOffset.add(tmpOpOffset); log.info("removeMap contain prepareQueueOffset. real_topic={},uniqKey={},immunityTime={},offset={}", msgExt.getUserProperty(MessageConst.PROPERTY_REAL_TOPIC), @@ -466,6 +604,8 @@ private boolean checkPrepareQueueOffset(HashMap removeMap, ListOffsets are first buffered in per-queue {@link MessageQueueOpContext} + * queues. When the buffer exceeds {@code transactionOpMsgMaxSize}, or when + * the buffer is full, they are flushed as a batch OP message containing + * multiple comma-separated offsets. If buffering fails entirely, a single + * OP message is written synchronously. + * + *

The transaction checker later reads the OP topic and skips any half + * message whose offset appears in the OP stream. + * + * @param messageExt the prepared (half) message to delete + * @return {@code true} if the OP record was written successfully + */ @Override public boolean deletePrepareMessage(MessageExt messageExt) { Integer queueId = messageExt.getQueueId(); MessageQueueOpContext mqContext = deleteContext.get(queueId); + + // init mq op context if not exist if (mqContext == null) { mqContext = new MessageQueueOpContext(System.currentTimeMillis(), 20000); MessageQueueOpContext old = deleteContext.putIfAbsent(queueId, mqContext); @@ -605,11 +770,17 @@ public boolean deletePrepareMessage(MessageExt messageExt) { } } + // the body of OP_Message is the offset of Half_Message + // every Half_Message store a lot of offset, split by comma + // default number of offset is 4096 String data = messageExt.getQueueOffset() + TransactionalMessageUtil.OFFSET_SEPARATOR; try { + // add offset to context queue boolean res = mqContext.getContextQueue().offer(data, 100, TimeUnit.MILLISECONDS); + // if offer succeed, wait for batch write if (res) { int totalSize = mqContext.getTotalSize().addAndGet(data.length()); + // default value of transactionOpMsgMaxSize is 4096 if (totalSize > transactionalMessageBridge.getBrokerController().getBrokerConfig().getTransactionOpMsgMaxSize()) { this.transactionalOpBatchService.wakeup(); } @@ -620,6 +791,7 @@ public boolean deletePrepareMessage(MessageExt messageExt) { } catch (InterruptedException ignore) { } + // if failed to enqueue offset to memory queue, write to OP topic Message msg = getOpMessage(queueId, data); if (this.transactionalMessageBridge.writeOp(queueId, msg)) { log.warn("Force add remove op data. 
queueId={}", queueId); @@ -653,6 +825,17 @@ public void close() { this.getTransactionMetrics().persist(); } + /** + * build op message with data in deleteContext.get(queueId) + * - topic: op_topic + * - tag: REMOVE_TAG + * - body: moreData(prepareOffset + ",") + * + prepareOffset in deleteContext.get(queueId) + * + * @param queueId prepare message queueId + * @param moreData prepare message offset list + * @return op message + */ public Message getOpMessage(int queueId, String moreData) { String opTopic = TransactionalMessageUtil.buildOpTopic(); MessageQueueOpContext mqContext = deleteContext.get(queueId); @@ -695,17 +878,36 @@ public Message getOpMessage(int queueId, String moreData) { return new Message(opTopic, TransactionalMessageUtil.REMOVE_TAG, sb.toString().getBytes(TransactionalMessageUtil.CHARSET)); } + + /** + * Flush buffered delete offsets for all queues to the OP topic. + * Called by independent thread(TransactionalOpBatchService) + * + *

Iterates over each per-queue {@link MessageQueueOpContext}. If the + * buffer has data and the time since the last write exceeds + * {@code transactionOpBatchInterval} (or the buffer is oversized), the + * buffered offsets are drained via {@link #getOpMessage}, combined into + * a single OP message, and written via + * {@link TransactionalMessageBridge#writeOp}. + * + *

Called by {@link TransactionalOpBatchService#onWaitEnd()}. + * + * @return the earliest wakeup timestamp for the next flush, or 0 if no + * waiting is needed + */ public long batchSendOpMessage() { - long startTime = System.currentTimeMillis(); try { + long startTime = System.currentTimeMillis(); long firstTimestamp = startTime; Map sendMap = null; + // default transactionOpBatchInterval is 3000 long interval = transactionalMessageBridge.getBrokerController().getBrokerConfig().getTransactionOpBatchInterval(); + // default transactionOpMsgMaxSize is 4096 int maxSize = transactionalMessageBridge.getBrokerController().getBrokerConfig().getTransactionOpMsgMaxSize(); boolean overSize = false; for (Map.Entry entry : deleteContext.entrySet()) { MessageQueueOpContext mqContext = entry.getValue(); - //no msg in contextQueue + // skip: no data or wait for the interval if (mqContext.getTotalSize().get() <= 0 || mqContext.getContextQueue().size() == 0 || // wait for the interval mqContext.getTotalSize().get() < maxSize && @@ -718,10 +920,14 @@ public long batchSendOpMessage() { sendMap = new HashMap<>(); } + // build op message with data in deleteContext.get(queueId) + // will build a message contains all offsets in deleteContext.get(queueId) + // it's better to pass mqContext as parameter Message opMsg = getOpMessage(entry.getKey(), null); if (opMsg == null) { continue; } + sendMap.put(entry.getKey(), opMsg); firstTimestamp = Math.min(firstTimestamp, mqContext.getLastWriteTimestamp()); if (mqContext.getTotalSize().get() >= maxSize) { diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalOpBatchService.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalOpBatchService.java index fb6e9e8ce1a..aa0b2d844a6 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalOpBatchService.java +++ 
b/broker/src/main/java/org/apache/rocketmq/broker/transaction/queue/TransactionalOpBatchService.java @@ -22,6 +22,17 @@ import org.apache.rocketmq.logging.org.slf4j.Logger; import org.apache.rocketmq.logging.org.slf4j.LoggerFactory; +/** + * Background service that periodically flushes buffered OP (operation) messages + * to the {@code RMQ_SYS_TRANS_OP_HALF_TOPIC}. + * + *

Half-message delete offsets are accumulated in per-queue + * {@link MessageQueueOpContext} buffers. This service wakes up either when the + * buffer exceeds {@code transactionOpMsgMaxSize} or when the time-based + * {@code transactionOpBatchInterval} elapses, calling + * {@link TransactionalMessageServiceImpl#batchSendOpMessage()} to batch-write + * the buffered offsets as a single OP message. + */ public class TransactionalOpBatchService extends ServiceThread { private static final Logger LOGGER = LoggerFactory.getLogger(LoggerName.TRANSACTION_LOGGER_NAME); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/transaction/rocksdb/TransactionalMessageRocksDBService.java b/broker/src/main/java/org/apache/rocketmq/broker/transaction/rocksdb/TransactionalMessageRocksDBService.java index dbd3575d69c..372f04d5585 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/transaction/rocksdb/TransactionalMessageRocksDBService.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/transaction/rocksdb/TransactionalMessageRocksDBService.java @@ -136,23 +136,31 @@ private void checkTransRecordsStatus(List trs) { continue; } try { + // delete halfRecord if checkTimes > transactionCheckMax if (halfRecord.getCheckTimes() > brokerController.getBrokerConfig().getTransactionCheckMax()) { halfRecord.setDelete(true); updateList.add(halfRecord); log.info("TransactionalMessageRocksDBService checkTransRecordsStatus checkTimes > {}, need delete, checkTimes: {}, msgId: {}", brokerController.getBrokerConfig().getTransactionCheckMax(), halfRecord.getCheckTimes(), halfRecord.getUniqKey()); continue; } + MessageExt msgExt = transMessageRocksDBStore.getMessage(halfRecord.getOffsetPy(), halfRecord.getSizePy()); + // delete halfRecord if msgExt is null if (null == msgExt) { log.error("TransactionalMessageRocksDBService checkTransRecordsStatus, msgExt is null, offsetPy: {}, sizePy: {}", halfRecord.getOffsetPy(), halfRecord.getSizePy()); halfRecord.setDelete(true); 
updateList.add(halfRecord); continue; } + if (!isImmunityTimeExpired(msgExt)) { continue; } + + // call client to check transaction status resolveHalfMsg(msgExt); + + // update checkTimes halfRecord.setCheckTimes(halfRecord.getCheckTimes() + 1); if (halfRecord.getCheckTimes() > brokerController.getBrokerConfig().getTransactionCheckMax()) { halfRecord.setDelete(true); diff --git a/broker/src/main/java/org/apache/rocketmq/broker/util/HookUtils.java b/broker/src/main/java/org/apache/rocketmq/broker/util/HookUtils.java index 94be46ea405..ba3b2d1f891 100644 --- a/broker/src/main/java/org/apache/rocketmq/broker/util/HookUtils.java +++ b/broker/src/main/java/org/apache/rocketmq/broker/util/HookUtils.java @@ -38,9 +38,27 @@ import org.apache.rocketmq.store.PutMessageResult; import org.apache.rocketmq.store.PutMessageStatus; import org.apache.rocketmq.store.config.BrokerRole; +import org.apache.rocketmq.store.hook.SendMessageBackHook; import org.apache.rocketmq.store.queue.ConsumeQueueStoreInterface; import org.apache.rocketmq.store.timer.TimerMessageStore; +/** + * Pre-processing utilities invoked before putting a message to the store. + * + *

All methods are static and called sequentially from + * {@code SendMessageProcessor#asyncSendMessage}: + *

    + *
  1. {@link #checkBeforePutMessage} — validates store state, topic length, + * body presence, and OS page cache pressure
  2. + *
  3. {@link #checkInnerBatch} — checks inner-batch sysFlag consistency
  4. + *
  5. {@link #handleScheduleMessage} — routes timer and delay-level messages + * to {@code TIMER_TOPIC} or {@code SCHEDULE_TOPIC}
  6. + *
  7. {@link #handleLmqQuota} — enforces Light Message Queue limits
  8. + *
+ * + *

If any step returns a non-null {@link PutMessageResult}, the operation is + * aborted immediately. + */ public class HookUtils { protected static final Logger LOG = LoggerFactory.getLogger(LoggerName.BROKER_LOGGER_NAME); @@ -58,6 +76,13 @@ public class HookUtils { */ private static final Integer MAX_TOPIC_LENGTH = 255; + /** + * Pre-put message validation: guards against writes when the store is + * shut down, in slave mode (non-duplication), not writable, topic too long, + * body null, or OS page cache busy. + * + * @return null if the check passes, or a rejection {@link PutMessageResult} + */ public static PutMessageResult checkBeforePutMessage(BrokerController brokerController, final MessageExt msg) { if (brokerController.getMessageStore().isShutdown()) { LOG.warn("message store has shutdown, so putMessage is forbidden"); @@ -109,6 +134,14 @@ public static PutMessageResult checkBeforePutMessage(BrokerController brokerCont return null; } + /** + * Check inner-batch sysFlag consistency + * There is no inner-batch after v5.0.0 + * + * @param brokerController brokerController(object container) + * @param msg msg + * @return putMessageResult + */ public static PutMessageResult checkInnerBatch(BrokerController brokerController, final MessageExt msg) { if (msg.getProperties().containsKey(MessageConst.PROPERTY_INNER_NUM) && !MessageSysFlag.check(msg.getSysFlag(), MessageSysFlag.INNER_BATCH_FLAG)) { @@ -127,12 +160,31 @@ public static PutMessageResult checkInnerBatch(BrokerController brokerController return null; } + /** + * Route timer or delay-level messages to the appropriate system topic. + * + *

For non-transaction or committed messages, two checks run in order: + *

    + *
  1. Timer wheel — if the message carries timer properties + * ({@code PROPERTY_TIMER_DELIVER_MS}, etc.), it is transformed and + * redirected to {@code TIMER_TOPIC}. The TimerWheel must be enabled, + * otherwise the message is rejected.
  2. + *
  3. Delay level — if {@code delayTimeLevel > 0}, the message + * is redirected to {@code SCHEDULE_TOPIC_XXXX}. Both checks can apply + * to the same message (legacy bridge).
  4. + *
+ * + * @return non-null {@link PutMessageResult} if the message was rejected + */ public static PutMessageResult handleScheduleMessage(BrokerController brokerController, final MessageExtBrokerInner msg) { final int tranType = MessageSysFlag.getTransactionValue(msg.getSysFlag()); + // normal message or committed message can be delayed if (tranType == MessageSysFlag.TRANSACTION_NOT_TYPE || tranType == MessageSysFlag.TRANSACTION_COMMIT_TYPE) { + // is timer topic if (!isRolledTimerMessage(msg)) { + // double check, has delay level or, is timer topic and has delivery time if (checkIfTimerMessage(msg)) { if (!brokerController.getMessageStoreConfig().isTimerWheelEnable()) { //wheel timer is not enabled, reject the message @@ -144,7 +196,7 @@ public static PutMessageResult handleScheduleMessage(BrokerController brokerCont } } } - // Delay Delivery + // Delay Delivery, useless with default config if (msg.getDelayTimeLevel() > 0) { transformDelayLevelMessage(brokerController, msg); } @@ -152,6 +204,13 @@ public static PutMessageResult handleScheduleMessage(BrokerController brokerCont return null; } + /** + * Enforce Light Message Queue (LMQ) quota: reject the message if the + * number of LMQ consume queues would exceed the configured maximum and + * the target queue does not already exist. 
+ * + * @return null if the check passes, or a rejection {@link PutMessageResult} + */ public static PutMessageResult handleLmqQuota(BrokerController brokerController, final MessageExtBrokerInner msg) { if (!brokerController.getMessageStoreConfig().isEnableLmqQuota() || !brokerController.getMessageStoreConfig().isEnableLmq() @@ -201,10 +260,34 @@ public static boolean checkIfTimerMessage(MessageExtBrokerInner msg) { return null != msg.getProperty(MessageConst.PROPERTY_TIMER_DELIVER_MS) || null != msg.getProperty(MessageConst.PROPERTY_TIMER_DELAY_MS) || null != msg.getProperty(MessageConst.PROPERTY_TIMER_DELAY_SEC); } + /** + * Transform a timer message and redirect it to the timer wheel topic. + * + *

Parses the delivery time from {@code PROPERTY_TIMER_DELAY_SEC}, + * {@code PROPERTY_TIMER_DELAY_MS}, or {@code PROPERTY_TIMER_DELIVER_MS}. + * The time is aligned to {@code timerPrecisionMs} boundaries to match + * the TimerWheel tick resolution. + * + *

The original topic and queue are saved as properties. + * The topic is then changed to {@link TimerMessageStore#TIMER_TOPIC}, + * and the queue is changed to 0. + * + *

Rejection conditions: + *

    + *
  • Non-delay-level messages exceeding {@code timerMaxDelaySec}
  • + *
  • TimerWheel slot congestion ({@link TimerMessageStore#isReject})
  • + *
+ * + * @param brokerController the broker controller + * @param msg the message to transform + * @return a non-null {@link PutMessageResult} if the message is rejected + */ private static PutMessageResult transformTimerMessage(BrokerController brokerController, MessageExtBrokerInner msg) { //do transform int delayLevel = msg.getDelayTimeLevel(); + + // calculate deliver time long deliverMs; try { if (msg.getProperty(MessageConst.PROPERTY_TIMER_DELAY_SEC) != null) { @@ -217,21 +300,28 @@ private static PutMessageResult transformTimerMessage(BrokerController brokerCon } catch (Exception e) { return new PutMessageResult(PutMessageStatus.WHEEL_TIMER_MSG_ILLEGAL, null); } + if (deliverMs > System.currentTimeMillis()) { + // default value of timerMaxDelaySec is 3600 * 24 * 3 if (delayLevel <= 0 && deliverMs - System.currentTimeMillis() > brokerController.getMessageStoreConfig().getTimerMaxDelaySec() * 1000L) { return new PutMessageResult(PutMessageStatus.WHEEL_TIMER_MSG_ILLEGAL, null); } + // precision operation int timerPrecisionMs = brokerController.getMessageStoreConfig().getTimerPrecisionMs(); if (deliverMs % timerPrecisionMs == 0) { + // Exactly on boundary → move one tick earlier deliverMs -= timerPrecisionMs; } else { + // Not on boundary → round down to nearest tick deliverMs = deliverMs / timerPrecisionMs * timerPrecisionMs; } + // flow control, always skip with default config if (brokerController.getTimerMessageStore().isReject(deliverMs)) { return new PutMessageResult(PutMessageStatus.WHEEL_TIMER_FLOW_CONTROL, null); } + MessageAccessor.putProperty(msg, MessageConst.PROPERTY_TIMER_OUT_MS, deliverMs + ""); MessageAccessor.putProperty(msg, MessageConst.PROPERTY_REAL_TOPIC, msg.getTopic()); MessageAccessor.putProperty(msg, MessageConst.PROPERTY_REAL_QUEUE_ID, String.valueOf(msg.getQueueId())); @@ -259,6 +349,14 @@ public static void transformDelayLevelMessage(BrokerController brokerController, 
msg.setQueueId(ScheduleMessageService.delayLevel2QueueId(msg.getDelayTimeLevel())); } + /** + * Forward messages to another broker (typically the retry / dead-letter + * queue destination). Used as the {@link SendMessageBackHook} implementation. + * + *

Each message is sent with {@code waitStoreMsgOK=false} and a 3s timeout. + * Messages are removed from the list on success; on any failure the entire + * batch is aborted and {@code false} is returned. + */ public static boolean sendMessageBack(BrokerController brokerController, List msgList, String brokerName, String brokerAddr) { try { diff --git a/common/src/main/java/org/apache/rocketmq/common/BrokerConfig.java b/common/src/main/java/org/apache/rocketmq/common/BrokerConfig.java index 38644659e10..b6f34fb9576 100644 --- a/common/src/main/java/org/apache/rocketmq/common/BrokerConfig.java +++ b/common/src/main/java/org/apache/rocketmq/common/BrokerConfig.java @@ -243,6 +243,11 @@ public class BrokerConfig extends BrokerIdentity { private int popFromRetryProbabilityForPriority = 0; // 0 as the lowest priority if true private boolean priorityOrderAsc = true; + /** + * There are two types of ack mode: + * 1. ack by file system service, which is the default mode. + * 2. ack by key-value service, when popConsumerKVServiceEnable and popConsumerKVServiceInit are both true. 
+ */ private boolean popConsumerFSServiceInit = true; private boolean popConsumerKVServiceLog = false; private boolean popConsumerKVServiceInit = false; @@ -463,7 +468,7 @@ public class BrokerConfig extends BrokerIdentity { private boolean usePIDColdCtrStrategy = true; private long cgColdReadThreshold = 3 * 1024 * 1024; private long globalColdReadThreshold = 100 * 1024 * 1024; - + /** * The interval to fetch namesrv addr, default value is 10 second */ @@ -2110,11 +2115,11 @@ public boolean isUseStaticSubscription() { public void setUseStaticSubscription(boolean useStaticSubscription) { this.useStaticSubscription = useStaticSubscription; } - + public long getFetchNamesrvAddrInterval() { return fetchNamesrvAddrInterval; } - + public void setFetchNamesrvAddrInterval(final long fetchNamesrvAddrInterval) { this.fetchNamesrvAddrInterval = fetchNamesrvAddrInterval; } diff --git a/common/src/main/java/org/apache/rocketmq/common/config/AbstractRocksDBStorage.java b/common/src/main/java/org/apache/rocketmq/common/config/AbstractRocksDBStorage.java index 4875ce43e22..f94f93ee07a 100644 --- a/common/src/main/java/org/apache/rocketmq/common/config/AbstractRocksDBStorage.java +++ b/common/src/main/java/org/apache/rocketmq/common/config/AbstractRocksDBStorage.java @@ -57,6 +57,17 @@ import org.rocksdb.WriteBatch; import org.rocksdb.WriteOptions; +/** + * Abstract base class for all RocksDB-backed storage in RocketMQ. + * + *

Provides common CRUD operations ({@code put}, {@code get}, {@code delete}, + * {@code batchPut}, {@code rangeDelete}, {@code iterate}), lifecycle management + * ({@code start}, {@code shutdown}), automatic recovery on corruption + * ({@code scheduleReloadRocksdb}), and manual compaction scheduling. + * + *

Subclasses define column families in {@link #postLoad()} and handle + * their own cleanup in {@link #preShutdown()}. + */ public abstract class AbstractRocksDBStorage { protected static final Logger LOGGER = LoggerFactory.getLogger(LoggerName.ROCKSDB_LOGGER_NAME); @@ -76,10 +87,14 @@ public abstract class AbstractRocksDBStorage { protected RocksDB db; protected DBOptions options; + // WriteOptions with WAL disabled for high-throughput index writes that can be rebuilt protected WriteOptions writeOptions; + // WriteOptions with WAL enabled for durability-critical writes (trans, timer checkpoints) protected WriteOptions ableWalWriteOptions; + // ReadOptions using prefix seek (fast, index-friendly) protected ReadOptions readOptions; + // ReadOptions using total-order seek (slower, required for range scans without prefix) protected ReadOptions totalOrderReadOptions; protected CompactionOptions compactionOptions; @@ -93,8 +108,10 @@ public abstract class AbstractRocksDBStorage { protected volatile boolean loaded; protected CompressionType compressionType = CompressionType.LZ4_COMPRESSION; + // Set to true when a reload is scheduled, causing hold() to reject new operations private volatile boolean closed; + // Guard to ensure only one reload attempt at a time private final Semaphore reloadPermit = new Semaphore(1); private final ScheduledExecutorService reloadScheduler = ThreadUtils.newScheduledThreadPool(1, new ThreadFactoryImpl("RocksDBStorageReloadService_")); private final ThreadPoolExecutor manualCompactionThread = (ThreadPoolExecutor) ThreadUtils.newThreadPoolExecutor( @@ -175,6 +192,16 @@ protected void initFlushOptions() { this.flushOptions = new FlushOptions(); } + /** + * check RocksDB status. isReady maybe a better name. + * + *

Called before every read/write operation. Returns {@code true} if the + * database is fully loaded, the handle is non-null, and the instance has not + * been closed (e.g. due to a scheduled reload). Subclasses may override + * {@link #release()} to pair with this call (e.g. for reference counting). + * + * @return {@code true} if the database is ready for operations + */ public boolean hold() { if (!this.loaded || this.db == null || this.closed) { LOGGER.error("hold rocksdb Failed. {}", this.dbPath); @@ -333,6 +360,17 @@ public void iterate(ColumnFamilyHandle columnFamilyHandle, final byte[] prefix, iterate(columnFamilyHandle, prefix, null, null, callback); } + /** + * Iterate over keys in a column family with optional prefix and range bounds. + * + *

If a prefix is given without an explicit start, the prefix serves as + * the lower bound and iteration continues until keys deviate from the prefix. + * If a start key is given, it takes precedence over the prefix for seeking. + * + * @param prefix optional lower-bound prefix; iteration stops when key deviates + * @param start optional explicit start key (overrides prefix for seek) + * @param end optional upper bound (exclusive) + */ public void iterate(ColumnFamilyHandle columnFamilyHandle, byte[] prefix, final byte[] start, final byte[] end, BiConsumer callback) throws RocksDBException { @@ -644,6 +682,15 @@ public List getCompactionStatus() { } } + /** + * Auto-recovery hook: when a write/delete/flush operation throws a RocksDB + * exception with code {@code Aborted}, {@code Corruption}, or + * {@code Undefined}, schedule a full reload of the RocksDB instance after + * a 10-second delay. The {@link #reloadPermit} semaphore ensures only one + * reload is scheduled at a time. While the reload is in progress, + * {@link #closed} is set to true, causing {@link #hold()} to reject all + * new operations. + */ private void scheduleReloadRocksdb(RocksDBException rocksDBException) { if (rocksDBException == null || rocksDBException.getStatus() == null) { return; diff --git a/common/src/main/java/org/apache/rocketmq/common/utils/DataConverter.java b/common/src/main/java/org/apache/rocketmq/common/utils/DataConverter.java index cc96770b22a..474179d52f8 100644 --- a/common/src/main/java/org/apache/rocketmq/common/utils/DataConverter.java +++ b/common/src/main/java/org/apache/rocketmq/common/utils/DataConverter.java @@ -19,15 +19,33 @@ import java.nio.ByteBuffer; import java.nio.charset.Charset; +/** + * Bit-level utility methods, primarily used by Pop-mode ack tracking. + * + *

An {@code int} bitmask is used to track the ack state of up to 32 sub-messages + * within a single Pop checkpoint (see {@code PopCheckPoint}). + */ public class DataConverter { public static final Charset CHARSET_UTF8 = Charset.forName("UTF-8"); + /** + * Convert a {@code long} to an 8-byte array (big-endian). + */ public static byte[] Long2Byte(Long v) { ByteBuffer tmp = ByteBuffer.allocate(8); tmp.putLong(v); return tmp.array(); } + /** + * Set or clear the bit at {@code index} in an int bitmask. + *

Uses {@code 1L} (long literal) to avoid signed-int overflow when {@code index == 31}. + * + * @param value the original bitmask + * @param index the bit position (0-based, 0..31) + * @param flag {@code true} to set, {@code false} to clear + * @return the updated bitmask + */ public static int setBit(int value, int index, boolean flag) { if (flag) { return (int) (value | (1L << index)); @@ -36,6 +54,13 @@ public static int setBit(int value, int index, boolean flag) { } } + /** + * Test whether the bit at {@code index} is set in an int bitmask. + * + * @param value the bitmask + * @param index the bit position (0-based, 0..31) + * @return {@code true} if the bit is 1 + */ public static boolean getBit(int value, int index) { return (value & (1L << index)) != 0; } diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/ProxyStartup.java b/proxy/src/main/java/org/apache/rocketmq/proxy/ProxyStartup.java index 1b38a19ae6a..94e4f4ffc8b 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/ProxyStartup.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/ProxyStartup.java @@ -55,6 +55,14 @@ public class ProxyStartup { private static final Logger log = LoggerFactory.getLogger(LoggerName.PROXY_LOGGER_NAME); + /** + * proxy components container, manager components with method start/shutdown/... + * - gRPC thread pool executor + * - message processor (wrap broker controller) + * - grpc server + * - remoting protocol server + * - ... + */ private static final ProxyStartAndShutdown PROXY_START_AND_SHUTDOWN = new ProxyStartAndShutdown(); private static class ProxyStartAndShutdown extends AbstractStartAndShutdown { @@ -73,8 +81,10 @@ public static void main(String[] args) { // init thread pool monitor for proxy. 
initThreadPoolMonitor(); + // init business thread pool for grpc server ThreadPoolExecutor executor = createServerExecutor(); + // create message processor, wrap broker controller in local mode MessagingProcessor messagingProcessor = createMessagingProcessor(); // tls cert update diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/config/ProxyConfig.java b/proxy/src/main/java/org/apache/rocketmq/proxy/config/ProxyConfig.java index 30681098358..6c606d06a4a 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/config/ProxyConfig.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/config/ProxyConfig.java @@ -264,7 +264,7 @@ public class ProxyConfig implements ConfigFile { private String remotingAccessAddr = ""; private int remotingListenPort = 8080; - // related to proxy's send strategy in cluster mode. + // related to proxy's sending strategy in cluster mode. private boolean sendLatencyEnable = false; private boolean startDetectorEnable = false; private int detectTimeout = 200; @@ -272,9 +272,38 @@ public class ProxyConfig implements ConfigFile { private int remotingHeartbeatThreadPoolNums = 2 * PROCESSOR_NUMBER; private int remotingTopicRouteThreadPoolNums = 2 * PROCESSOR_NUMBER; + /** + * thread pool number for + * 1. send message(and send message v2) + * 2. send batch message + * 3. consume send message back + * 4. end transaction + * 5. recall message + */ private int remotingSendMessageThreadPoolNums = 4 * PROCESSOR_NUMBER; + /** + * thread pool number for + * 1. pull message + * 2. lite pull message + * 3. pop message + */ private int remotingPullMessageThreadPoolNums = 4 * PROCESSOR_NUMBER; + /** + * thread pool number for + * 1. update consumer offset + * 2. ack message + * 3. change message invisible time + * 4. get consumer connection list + */ private int remotingUpdateOffsetThreadPoolNums = 4 * PROCESSOR_NUMBER; + /** + * thread pool number for + * 1. unregister client + * 2. check client config + * 3. get consumer list by group + * 4. 
get min/max offset, query consume offset, search offset by timestamp + * 5. lock/unlock batch mq + */ private int remotingDefaultThreadPoolNums = 4 * PROCESSOR_NUMBER; private int remotingHeartbeatThreadPoolQueueCapacity = 50000; diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/GrpcMessagingApplication.java b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/GrpcMessagingApplication.java index 3429ad54e27..12508d32108 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/GrpcMessagingApplication.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/GrpcMessagingApplication.java @@ -73,6 +73,15 @@ import org.apache.rocketmq.proxy.grpc.v2.common.ResponseWriter; import org.apache.rocketmq.proxy.processor.MessagingProcessor; +/** + * RocketMQ gRPC protocol implementation + * + *

<ul>
+ *   <li>implements gRPC protocol</li>
+ *   <li>execute request in independent thread pool</li>
+ *   <li>execute pipeline, ...</li>
+ * </ul>
+ */ public class GrpcMessagingApplication extends MessagingServiceGrpc.MessagingServiceImplBase implements StartAndShutdown { private final static Logger log = LoggerFactory.getLogger(LoggerName.PROXY_LOGGER_NAME); @@ -168,6 +177,16 @@ protected Status convertExceptionToStatus(Throwable t) { return ResponseBuilder.getInstance().buildStatus(t); } + /** + * submit grpc task to related thread pool. + * + * @param executor thread pool + * @param context context + * @param request grpc request + * @param runnable process task + * @param responseObserver grpc response observer + * @param statusResponseCreator error response creator + */ protected void addExecutor(ExecutorService executor, ProxyContext context, V request, Runnable runnable, StreamObserver responseObserver, Function statusResponseCreator) { if (request instanceof GeneratedMessageV3) { @@ -201,6 +220,12 @@ protected void validateContext(ProxyContext context) { } } + /** + * route query api, producer/consumer will call this api while starting. + * + * @param request request + * @param responseObserver gRPC response observer + */ @Override public void queryRoute(QueryRouteRequest request, StreamObserver responseObserver) { Function statusResponseCreator = status -> QueryRouteResponse.newBuilder().setStatus(status).build(); @@ -218,6 +243,12 @@ public void queryRoute(QueryRouteRequest request, StreamObserver responseObserver) { Function statusResponseCreator = status -> HeartbeatResponse.newBuilder().setStatus(status).build(); @@ -252,6 +283,12 @@ public void sendMessage(SendMessageRequest request, StreamObserver responseObserver) { @@ -420,6 +457,15 @@ public void syncLiteSubscription(SyncLiteSubscriptionRequest request, } } + /** + * telemetry API + * + *
<ul>
+ *   <li>register producer/consumer</li>
+ *   <li>process trace</li>
+ *   <li>verify message result</li>
+ * </ul>
+ */ @Override public StreamObserver telemetry(StreamObserver responseObserver) { Function statusResponseCreator = status -> TelemetryCommand.newBuilder().setStatus(status).build(); diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/channel/GrpcClientChannel.java b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/channel/GrpcClientChannel.java index 0135818fb3b..06b2ed95012 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/channel/GrpcClientChannel.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/channel/GrpcClientChannel.java @@ -65,6 +65,9 @@ public class GrpcClientChannel extends ProxyChannel implements ChannelExtendAttr private final GrpcChannelManager grpcChannelManager; private final GrpcClientSettingsManager grpcClientSettingsManager; + // gRPC Telemetry bidirectional stream observer for server-to-client push commands. + // Set by setClientObserver() on Telemetry connection establishment, cleared by clearClientObserver() + // on write failure. isOpen/isActive/isWritable all delegate to this ref being non-null. private final AtomicReference> telemetryCommandRef = new AtomicReference<>(); private final Object telemetryWriteLock = new Object(); private final String clientId; @@ -260,6 +263,15 @@ public String getClientId() { return clientId; } + /** + * Write a {@link TelemetryCommand} to the gRPC telemetry stream. + * + *

Uses double-checked locking with {@link #telemetryWriteLock} to + * safely handle concurrent writes. If the underlying gRPC stream is + * closed or throws {@link StatusRuntimeException} / + * {@link IllegalStateException} (e.g. client disconnected), the observer + * is cleared so subsequent writes are silently skipped. + */ public void writeTelemetryCommand(TelemetryCommand command) { StreamObserver observer = this.telemetryCommandRef.get(); if (observer == null) { diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/consumer/ReceiveMessageActivity.java b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/consumer/ReceiveMessageActivity.java index becf2c2165d..83b5e3a89a8 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/consumer/ReceiveMessageActivity.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/consumer/ReceiveMessageActivity.java @@ -59,11 +59,22 @@ public ReceiveMessageActivity(MessagingProcessor messagingProcessor, super(messagingProcessor, grpcClientSettingsManager, grpcChannelManager); } + /** + * + * @param ctx ctx + * @param request + * request.invisible_duration => + * Required if client type is simple consumer. 
+ * useless for PushConsumer + * + * @param responseObserver responseObserver + */ public void receiveMessage(ProxyContext ctx, ReceiveMessageRequest request, StreamObserver responseObserver) { ReceiveMessageResponseStreamWriter writer = createWriter(ctx, responseObserver); try { + // Settings were registered when client connected Settings settings = this.grpcClientSettingsManager.getClientSettings(ctx); final boolean isLite = ClientType.LITE_PUSH_CONSUMER.equals(settings.getClientType()); @@ -104,9 +115,13 @@ public void receiveMessage(ProxyContext ctx, ReceiveMessageRequest request, String topic = request.getMessageQueue().getTopic().getName(); String group = request.getGroup().getName(); + // invisibleTime was set by client + // proxy can override it long actualInvisibleTime = Durations.toMillis(request.getInvisibleDuration()); ProxyConfig proxyConfig = ConfigurationManager.getProxyConfig(); + // default enableProxyAutoRenew is true if (proxyConfig.isEnableProxyAutoRenew() && request.getAutoRenew()) { + // default defaultInvisibleTimeMills is 60s actualInvisibleTime = proxyConfig.getDefaultInvisibleTimeMills(); } else { validateInvisibleTime(actualInvisibleTime, @@ -192,13 +207,37 @@ public void receiveMessage(ProxyContext ctx, ReceiveMessageRequest request, } } + /** + * Register receipt handles for auto-renewal of message visibility timeouts. + * + *

When auto-renew is enabled ({@code enableProxyAutoRenew}), the proxy + * periodically extends the invisible time of delivered but unacked messages + * so that they are not revived while the consumer is still processing them. + * + *

This method extracts the {@code PROPERTY_POP_CK} from each popped + * message, wraps it into a {@link MessageReceiptHandle}, and registers it + * via {@link MessagingProcessor#addReceiptHandle}. The returned + * {@link Runnable} is executed after the response has been written to the + * client stream. + * + * @param ctx the proxy context + * @param request the original receive-message request + * @param group consumer group + * @param topic topic name + * @param popResult the pop result returned from the broker + * @param writer the response stream writer + * @return a runnable to execute after the response write, or {@code null} + * if no messages were found + */ private Runnable handleAutoRenew(ProxyContext ctx, ReceiveMessageRequest request, String group, String topic, PopResult popResult, ReceiveMessageResponseStreamWriter writer ) { + // check result status if (!PopStatus.FOUND.equals(popResult.getPopStatus())) { return null; } + // get socket channel GrpcClientChannel clientChannel = grpcChannelManager.getChannel(ctx.getClientID()); if (clientChannel == null) { GrpcProxyException e = new GrpcProxyException(Code.MESSAGE_NOT_FOUND, @@ -207,6 +246,7 @@ private Runnable handleAutoRenew(ProxyContext ctx, ReceiveMessageRequest request writer.processThrowableWhenWriteMessage(e, ctx, request, messageExt)); throw e; } + return () -> { List messageExtList = popResult.getMsgFoundList(); for (MessageExt messageExt : messageExtList) { diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/producer/SendMessageActivity.java b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/producer/SendMessageActivity.java index c0138cae7fa..b9925d98909 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/producer/SendMessageActivity.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/producer/SendMessageActivity.java @@ -63,6 +63,20 @@ public SendMessageActivity(MessagingProcessor messagingProcessor, super(messagingProcessor, 
grpcClientSettingsManager, grpcChannelManager); } + /** + * send message, execute in producer thread pool + * request flow: + * producer -> grpcRequest -> GrpcMessagingApplication -> ProducerThreadPoolForGrpc(...) + * functionality: + * 1. validate topic + * 2. create queue selector + * 3. build and validate message + * 4. convert response + * + * @param ctx proxy context + * @param request send message request + * @return send message response future + */ public CompletableFuture sendMessage(ProxyContext ctx, SendMessageRequest request) { CompletableFuture future = new CompletableFuture<>(); diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/route/RouteActivity.java b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/route/RouteActivity.java index 75f7089c5e0..87004c9c653 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/route/RouteActivity.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/grpc/v2/route/RouteActivity.java @@ -60,6 +60,31 @@ public RouteActivity(MessagingProcessor messagingProcessor, super(messagingProcessor, grpcClientSettingsManager, grpcChannelManager); } + /** + * query route info by topic + * + * @param ctx ctx + * @param request { + * topic: xxx, + * endpoints: from client config, it is an endpoint list, it's a bad design + * } + * @return route info { + * status: xxx, + * message_queues: [ + * { + * topic: xxx, + * id: xxx, // queue id + * permission: (enum)xxx, + * broker: { + * name: xxx, + * id: xxx, + * endPoints: xxx, + * }, + * accept_message_type: xxx + * }, ... 
+ * ] + * } + */ public CompletableFuture queryRoute(ProxyContext ctx, QueryRouteRequest request) { CompletableFuture future = new CompletableFuture<>(); try { @@ -96,6 +121,34 @@ public CompletableFuture queryRoute(ProxyContext ctx, QueryR return future; } + /** + * + * @param ctx context + * @param request request { + * topic: xxx, + * group: xxx, + * endpoints: xxx + * } + * @return response { + * status: xxx, + * assignments: [ + * { + * message_queue: { + * topic: xxx, + * id: xxx, // queue id + * permission: (enum)xxx, + * broker: { + * name: xxx, + * id: xxx, + * endPoints: xxx, + * }, + * accept_message_type: xxx + * } + * }, + * ... + * ] + * } + */ public CompletableFuture queryAssignment(ProxyContext ctx, QueryAssignmentRequest request) { CompletableFuture future = new CompletableFuture<>(); diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ProducerProcessor.java b/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ProducerProcessor.java index 8c4907c588a..e84045e0646 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ProducerProcessor.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ProducerProcessor.java @@ -66,6 +66,14 @@ public ProducerProcessor(MessagingProcessor messagingProcessor, this.topicMessageTypeValidator = new DefaultTopicMessageTypeValidator(); } + /** + * send message + * 1. validate message type + * 2. select queue + * 3. set message id if not set + * 4. call message service + * 5. 
fill transaction data if send succeed and is transaction message + */ public CompletableFuture> sendMessage(ProxyContext ctx, QueueSelector queueSelector, String producerGroup, int sysFlag, List messageList, long timeoutMillis) { CompletableFuture> future = new CompletableFuture<>(); @@ -96,6 +104,7 @@ public CompletableFuture> sendMessage(ProxyContext ctx, QueueSe SendMessageRequestHeader requestHeader = buildSendMessageRequestHeader(messageList, producerGroup, sysFlag, messageQueue.getQueueId()); AddressableMessageQueue finalMessageQueue = messageQueue; + // call SendMessageProcessor of broker future = this.serviceManager.getMessageService().sendMessage( ctx, messageQueue, diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ReceiptHandleProcessor.java b/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ReceiptHandleProcessor.java index bc3730aed9a..555b78f1906 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ReceiptHandleProcessor.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/processor/ReceiptHandleProcessor.java @@ -29,17 +29,50 @@ import org.apache.rocketmq.proxy.service.ServiceManager; import org.apache.rocketmq.proxy.service.receipt.DefaultReceiptHandleManager; +/** + * Bridges receipt handle renewal events to the messaging processor. + * + *

Owns a {@link DefaultReceiptHandleManager} and wires its + * {@link RenewEvent} listener to {@link MessagingProcessor#changeInvisibleTime}. + * When a receipt handle is about to expire, the manager fires a {@code RENEW} + * event which this processor translates into a + * {@code ChangeInvisibleTime} call. + * + *

When the renewal limit is reached, a {@code STOP_RENEW} event fires + * which nacks the message via {@code changeInvisibleTime} with the group's + * retry policy delay. + */ public class ReceiptHandleProcessor extends AbstractProcessor { protected final static Logger log = LoggerFactory.getLogger(LoggerName.PROXY_LOGGER_NAME); protected DefaultReceiptHandleManager receiptHandleManager; + /** + * Wire the receipt handle manager to the messaging processor. + * + *

pass StateEventListener to DefaultReceiptHandleManager + * so that when DefaultReceiptHandleManager find the message is expired, + * call StateEventListener to change the invisible time of the message. + * + *

Creates an event listener that translates all {@link RenewEvent} + * types ({@code RENEW}, {@code STOP_RENEW}, {@code CLEAR_GROUP}) into + * {@link MessagingProcessor#changeInvisibleTime} calls, which update + * the message's visibility timeout on the broker. + * + * @param messagingProcessor the core messaging processor + * @param serviceManager the service manager providing metadata and consumer services + */ public ReceiptHandleProcessor(MessagingProcessor messagingProcessor, ServiceManager serviceManager) { super(messagingProcessor, serviceManager); + + // create event listener StateEventListener eventListener = event -> { + // convert event to ReceiptHandle ProxyContext context = createContext(event.getEventType().name()) .setChannel(event.getKey().getChannel()); MessageReceiptHandle messageReceiptHandle = event.getMessageReceiptHandle(); ReceiptHandle handle = ReceiptHandle.decode(messageReceiptHandle.getReceiptHandleStr()); + + // change invisible time messagingProcessor .changeInvisibleTime(context, handle, messageReceiptHandle.getMessageId(), messageReceiptHandle.getGroup(), messageReceiptHandle.getTopic(), @@ -52,6 +85,8 @@ public ReceiptHandleProcessor(MessagingProcessor messagingProcessor, ServiceMana event.getFuture().complete(v); }); }; + + // pass event listener to DefaultReceiptHandleManager this.receiptHandleManager = new DefaultReceiptHandleManager(serviceManager.getMetadataService(), serviceManager.getConsumerManager(), eventListener); this.appendStartAndShutdown(receiptHandleManager); } diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/service/message/MessageService.java b/proxy/src/main/java/org/apache/rocketmq/proxy/service/message/MessageService.java index 1e828c36fd9..a92010d45b4 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/service/message/MessageService.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/service/message/MessageService.java @@ -47,6 +47,9 @@ public interface MessageService { + /** + * call 
SendMessageProcessor of broker + */ CompletableFuture> sendMessage( ProxyContext ctx, AddressableMessageQueue messageQueue, diff --git a/proxy/src/main/java/org/apache/rocketmq/proxy/service/receipt/DefaultReceiptHandleManager.java b/proxy/src/main/java/org/apache/rocketmq/proxy/service/receipt/DefaultReceiptHandleManager.java index f9dfd825337..9f168a3128b 100644 --- a/proxy/src/main/java/org/apache/rocketmq/proxy/service/receipt/DefaultReceiptHandleManager.java +++ b/proxy/src/main/java/org/apache/rocketmq/proxy/service/receipt/DefaultReceiptHandleManager.java @@ -60,6 +60,18 @@ import org.apache.rocketmq.remoting.protocol.subscription.RetryPolicy; import org.apache.rocketmq.remoting.protocol.subscription.SubscriptionGroupConfig; +/** + * Manages receipt handles for gRPC proxy auto-renewal of message visibility timeouts. + * + *

When auto-renew is enabled, popped messages are registered here with their + * {@code PROPERTY_POP_CK} data. A periodic {@link #scheduledExecutorService} scans + * all registered handles and extends the invisible time for messages that are + * about to expire. When the total renewal duration exceeds + * {@code renewMaxTimeMillis}, the message is nack'd and returned to the broker. + * + *

Handles are grouped by {@link ReceiptHandleGroupKey} (channel + consumer group) + * and cleaned up automatically when a gRPC client disconnects. + */ public class DefaultReceiptHandleManager extends AbstractStartAndShutdown implements ReceiptHandleManager { protected final static Logger log = LoggerFactory.getLogger(LoggerName.PROXY_LOGGER_NAME); protected final MetadataService metadataService; @@ -77,6 +89,8 @@ public DefaultReceiptHandleManager(MetadataService metadataService, ConsumerMana this.consumerManager = consumerManager; this.eventListener = eventListener; ProxyConfig proxyConfig = ConfigurationManager.getProxyConfig(); + + // by default, minThreadNum is 2, maxThreadNum is 4 this.renewalWorkerService = ThreadPoolMonitor.createAndMonitor( proxyConfig.getRenewThreadPoolNums(), proxyConfig.getRenewMaxThreadPoolNums(), @@ -84,6 +98,8 @@ public DefaultReceiptHandleManager(MetadataService metadataService, ConsumerMana "RenewalWorkerThread", proxyConfig.getRenewThreadPoolQueueCapacity() ); + + // by default, minThreadNum is 2, maxThreadNum is 4 this.returnHandleGroupWorkerService = ThreadPoolMonitor.createAndMonitor( proxyConfig.getReturnHandleGroupThreadPoolNums(), proxyConfig.getReturnHandleGroupThreadPoolNums() * 2, @@ -91,6 +107,8 @@ public DefaultReceiptHandleManager(MetadataService metadataService, ConsumerMana "ReturnHandleGroupWorkerThread", proxyConfig.getRenewThreadPoolQueueCapacity() ); + + // clear receipt by group when consumer unregister consumerManager.appendConsumerIdsChangeListener(new ConsumerIdsChangeListener() { @Override public void handle(ConsumerGroupEvent event, String group, Object... args) { @@ -115,11 +133,15 @@ public void shutdown() { } }); + this.receiptHandleGroupMap = new ConcurrentHashMap<>(); this.renewalWorkerService.setRejectedExecutionHandler((r, executor) -> log.warn("add renew task failed. 
queueSize:{}", executor.getQueue().size())); + + // add periodic scan task this.appendStartAndShutdown(new StartAndShutdown() { @Override public void start() throws Exception { + // by default, interval is 5000ms scheduledExecutorService.scheduleWithFixedDelay(() -> scheduleRenewTask(), 0, ConfigurationManager.getProxyConfig().getRenewSchedulePeriodMillis(), TimeUnit.MILLISECONDS); } @@ -154,6 +176,23 @@ protected boolean clientIsOffline(ReceiptHandleGroupKey groupKey) { return this.consumerManager.findChannel(groupKey.getGroup(), groupKey.getChannel()) == null; } + /** + * Periodic scan of all receipt handle groups, called by the + * {@link #scheduledExecutorService} at a fixed interval. + * + *

<p>For each group: + *

<ul>
+ *   <li>If the client has gone offline, the entire group is cleared
+ *       immediately</li>
+ *   <li>Otherwise, each handle is inspected — if its next visible time
+ *       minus the current time is within the {@code renewAheadTimeMillis}
+ *       threshold, a renewal is submitted to the
+ *       {@link #renewalWorkerService} thread pool</li>
+ * </ul>
+ * + *

The scan runs synchronously in the scheduler thread; the actual + * renewal work is dispatched asynchronously to the worker pool. + */ protected void scheduleRenewTask() { Stopwatch stopwatch = Stopwatch.createStarted(); try { @@ -191,15 +230,39 @@ protected void renewMessage(ProxyContext context, ReceiptHandleGroupKey key, Rec } } + /** + * Renew a single message's visibility timeout, or stop if the renewal + * limit has been reached. + * + *

Decision logic: + *

    + *
  • If the handle has exceeded {@code maxRenewRetryTimes}, it is + * dropped
  • + *
  • If the total renewal duration is within {@code renewMaxTimeMillis}, + * a {@link RenewEvent.EventType#RENEW} event is fired to extend the + * invisible time
  • + *
  • If the renewal duration has exceeded {@code renewMaxTimeMillis}, + * a {@link RenewEvent.EventType#STOP_RENEW} event is fired which + * nacks the message with the group's retry policy delay
  • + *
+ * + * @param context the proxy context + * @param key the receipt handle group key + * @param messageReceiptHandle the handle to renew + * @return a future completing with the updated handle (or {@code null} if + * renewal is stopped) + */ protected CompletableFuture startRenewMessage(ProxyContext context, ReceiptHandleGroupKey key, MessageReceiptHandle messageReceiptHandle) { CompletableFuture resFuture = new CompletableFuture<>(); ProxyConfig proxyConfig = ConfigurationManager.getProxyConfig(); long current = System.currentTimeMillis(); try { + // by default, maxRenewRetryTimes is 3 if (messageReceiptHandle.getRenewRetryTimes() >= proxyConfig.getMaxRenewRetryTimes()) { log.warn("handle has exceed max renewRetryTimes. handle:{}", messageReceiptHandle); return CompletableFuture.completedFuture(null); } + if (current - messageReceiptHandle.getConsumeTimestamp() < proxyConfig.getRenewMaxTimeMillis()) { CompletableFuture future = new CompletableFuture<>(); eventListener.fireEvent(new RenewEvent(key, messageReceiptHandle, RENEW_POLICY.nextDelayDuration(messageReceiptHandle.getRenewTimes()), RenewEvent.EventType.RENEW, future)); diff --git a/store/src/main/java/org/apache/rocketmq/store/CommitLog.java b/store/src/main/java/org/apache/rocketmq/store/CommitLog.java index d2f2da8b7d3..d2d277de9af 100644 --- a/store/src/main/java/org/apache/rocketmq/store/CommitLog.java +++ b/store/src/main/java/org/apache/rocketmq/store/CommitLog.java @@ -621,7 +621,7 @@ public DispatchRequest checkMessageAndReturnSize(java.nio.ByteBuffer byteBuffer, fullMessageBuffer.limit(messageStartPos + totalSize); byte[] fullMessageBytes = new byte[totalSize]; fullMessageBuffer.get(fullMessageBytes, 0, totalSize); - + // Print full message and especially properties log.warn( "CommitLog#checkAndDispatchMessage: failed to check message CRC, not found CRC in properties. 
topic={}, properties={}, propertiesLength={}, fullMessageHex={}", @@ -966,7 +966,26 @@ public void updateMaxMessageSize(PutMessageThreadLocal putMessageThreadLocal) { } } + /** + * Asynchronously encode and append a single message to the commit log. + * + *

The method: + *

    + *
  1. Encodes the message and validates compat flags (V1/V2, IPv6)
  2. + *
  3. Acquires a per-topic-queue-lock for offset assignment, then the + * global put-message lock for the append
  4. + *
  5. Acquires the commitLog write lock for MappedFile writing
  6. + *
  7. Appends to the current mapped file; if full, opens a new file + * and retries
  8. + *
  9. After the append, increments the consume queue offset and + * triggers HA replication if configured
  10. + *
+ * + * @param msg the message to write + * @return a future that completes with the append result + */ public CompletableFuture asyncPutMessage(final MessageExtBrokerInner msg) { + // format message and int context params // Set the storage time if (!defaultMessageStore.getMessageStoreConfig().isDuplicationEnable()) { msg.setStoreTimestamp(System.currentTimeMillis()); @@ -1004,6 +1023,8 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke updateMaxMessageSize(putMessageThreadLocal); String topicQueueKey = generateKey(putMessageThreadLocal.getKeyBuilder(), msg); long elapsedTimeInLock = 0; + + // locate mappedFile and get write position MappedFile unlockMappedFile = null; MappedFile mappedFile = this.mappedFileQueue.getLastMappedFile(); @@ -1014,6 +1035,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke currOffset = mappedFile.getFileFromOffset() + mappedFile.getWrotePosition(); } + // handle HA int needAckNums = this.defaultMessageStore.getMessageStoreConfig().getInSyncReplicas(); boolean needHandleHA = needHandleHA(msg); @@ -1037,7 +1059,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke topicQueueLock.lock(topicQueueKey); try { - + // assign consume queue offset boolean needAssignOffset = true; if (defaultMessageStore.getMessageStoreConfig().isDuplicationEnable() && defaultMessageStore.getMessageStoreConfig().getBrokerRole() != BrokerRole.SLAVE) { @@ -1047,6 +1069,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke defaultMessageStore.assignOffset(msg); } + // encode message and ... 
PutMessageResult encodeResult = putMessageThreadLocal.getEncoder().encode(msg); if (encodeResult != null) { return CompletableFuture.completedFuture(encodeResult); @@ -1056,6 +1079,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke putMessageLock.lock(); //spin or ReentrantLock, depending on store config try { + // validate and init some context params long beginLockTimestamp = this.defaultMessageStore.getSystemClock().now(); this.beginTimeInLock = beginLockTimestamp; @@ -1076,7 +1100,10 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke return CompletableFuture.completedFuture(new PutMessageResult(PutMessageStatus.CREATE_MAPPED_FILE_FAILED, null)); } + // append to mappedFile result = mappedFile.appendMessage(msg, this.appendMessageCallback, putMessageContext); + + // check result, retry if needed, switch (result.getStatus()) { case PUT_OK: onCommitLogAppend(msg, result, mappedFile); @@ -1112,6 +1139,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke beginTimeInLock = 0; putMessageLock.unlock(); } + // Increase queue offset when messages are successfully written if (AppendMessageStatus.PUT_OK.equals(result.getStatus())) { this.defaultMessageStore.increaseOffset(msg, getMessageNum(msg)); @@ -1122,6 +1150,7 @@ public CompletableFuture asyncPutMessage(final MessageExtBroke topicQueueLock.unlock(topicQueueKey); } + // unlock MappedFile and metrics if (elapsedTimeInLock > 500) { log.warn("[NOTIFYME]putMessage in lock cost time(ms)={}, bodyLength={} AppendMessageResult={}", elapsedTimeInLock, msg.getBody().length, result); } @@ -1327,6 +1356,20 @@ private boolean needHandleHA(MessageExt messageExt) { return true; } + /** + * Wait for both disk flush and HA replication to complete, then merge results. + * + *

Disk flush and HA replication run in parallel via + * {@link CompletableFuture#thenCombine}. If either fails, the combined + * result is updated with the failure status — both must succeed for + * the overall result to be {@code PUT_OK}. + * + * @param putMessageResult the append result to update + * @param messageExt the original message (needed by flush) + * @param needAckNums number of slave acks required (0/1 = no HA) + * @param needHandleHA whether HA replication is configured + * @return a future completing with the merged result + */ private CompletableFuture handleDiskFlushAndHA(PutMessageResult putMessageResult, MessageExt messageExt, int needAckNums, boolean needHandleHA) { CompletableFuture flushResultFuture = handleDiskFlush(putMessageResult.getAppendMessageResult(), messageExt); @@ -1406,6 +1449,20 @@ public long getMinOffset() { return -1; } + /** + * Read a message body from the commit log at the given physical offset. + * + *

The difference from getData is that + * getMessage performs an additional step: setInCache. + * + *

Finds the mapped file containing the offset and selects a buffer + * for the given size. The returned buffer includes cache-status metadata + * for cold-data flow control. + * + * @param offset physical offset in the commit log + * @param size number of bytes to read + * @return the mapped buffer, or {@code null} if the file is unavailable + */ public SelectMappedBufferResult getMessage(final long offset, final int size) { int mappedFileSize = this.defaultMessageStore.getMessageStoreConfig().getMappedFileSizeCommitLog(); MappedFile mappedFile = this.mappedFileQueue.findMappedFileByOffset(offset, offset == 0); diff --git a/store/src/main/java/org/apache/rocketmq/store/DefaultMessageStore.java b/store/src/main/java/org/apache/rocketmq/store/DefaultMessageStore.java index aee767dae2f..9bfbfca6961 100644 --- a/store/src/main/java/org/apache/rocketmq/store/DefaultMessageStore.java +++ b/store/src/main/java/org/apache/rocketmq/store/DefaultMessageStore.java @@ -211,6 +211,13 @@ public class DefaultMessageStore implements MessageStore { private final AtomicInteger mappedPageHoldCount = new AtomicInteger(0); + /** + * BatchDispatchRequest queue + * offer by ConcurrentReputMessageService.createBatchDispatchRequest() + * poll by MainBatchDispatchRequestService.pollBatchDispatchRequest() + * + *

If enableBuildConsumeQueueConcurrently is false, this queue is not used. + */ private final ConcurrentLinkedQueue batchDispatchRequestQueue = new ConcurrentLinkedQueue<>(); private final int dispatchRequestOrderlyQueueSize = 16; @@ -643,9 +650,20 @@ public long getMajorFileSize() { return commitLogSize + consumeQueueSize + indexFileSize; } + /** + * Asynchronously write a message to the commit log. + * + *

Before writing, any registered {@link PutMessageHook} instances are + * invoked — a non-null result from a hook short-circuits the process. + * Inner-batch message flags are validated + * then the actual write is delegated to {@link CommitLog#asyncPutMessage}. + * + * @param msg the message to write + * @return a future that completes with the put result + */ @Override public CompletableFuture asyncPutMessage(MessageExtBrokerInner msg) { - + // execute beforePutMessage hooks for (PutMessageHook putMessageHook : putMessageHookList) { PutMessageResult handleResult = putMessageHook.executeBeforePutMessage(msg); if (handleResult != null) { @@ -653,12 +671,14 @@ public CompletableFuture asyncPutMessage(MessageExtBrokerInner } } + // check inner batch message num if (msg.getProperties().containsKey(MessageConst.PROPERTY_INNER_NUM) && !MessageSysFlag.check(msg.getSysFlag(), MessageSysFlag.INNER_BATCH_FLAG)) { LOGGER.warn("[BUG]The message had property {} but is not an inner batch", MessageConst.PROPERTY_INNER_NUM); return CompletableFuture.completedFuture(new PutMessageResult(PutMessageStatus.MESSAGE_ILLEGAL, null)); } + // check inner batch message topic if (MessageSysFlag.check(msg.getSysFlag(), MessageSysFlag.INNER_BATCH_FLAG)) { Optional topicConfig = this.getTopicConfig(msg.getTopic()); if (!QueueTypeUtils.isBatchCq(topicConfig)) { @@ -667,9 +687,11 @@ public CompletableFuture asyncPutMessage(MessageExtBrokerInner } } + // put message long beginTime = this.getSystemClock().now(); CompletableFuture putResultFuture = this.commitLog.asyncPutMessage(msg); + // metrics putResultFuture.thenAccept(result -> { long elapsedTime = this.getSystemClock().now() - beginTime; if (elapsedTime > 500) { @@ -863,6 +885,33 @@ public CompletableFuture getMessageAsync(String group, String return CompletableFuture.completedFuture(getMessage(group, topic, queueId, offset, maxMsgNums, messageFilter)); } + /** + * Pull messages from the consume queue, applying filters and reading bodies + * 
from the commit log. + * + *

The method: + *

    + *
  1. Validates the store state and finds the consume queue for the + * topic
  2. + *
  3. Checks the offset against the queue bounds and sets the + * appropriate status if out of range
  4. + *
  5. Iterates through the consume queue entries, applies both + * consume-queue-level and commit-log-level message filters
  6. + *
  7. Reads message bodies from the commit log and appends them to the + * result until the size or count limit is reached
  8. + *
  9. Reports disk-fall-behind metrics and suggests pulling from a + * slave if the data is too far behind in physical offset
  10. + *
+ * + * @param group consumer group + * @param topic topic name + * @param queueId queue id + * @param offset starting offset in the consume queue + * @param maxMsgNums maximum number of messages to return + * @param maxTotalMsgSize maximum total message body size + * @param messageFilter message filter (maybe null) + * @return the pull result with status, messages, and next offset + */ @Override public GetMessageResult getMessage(final String group, final String topic, final int queueId, final long offset, final int maxMsgNums, final int maxTotalMsgSize, final MessageFilter messageFilter) { @@ -876,6 +925,7 @@ public GetMessageResult getMessage(final String group, final String topic, final return null; } + // try to get from compaction store Optional topicConfig = getTopicConfig(topic); CleanupPolicy policy = CleanupPolicyUtils.getDeletePolicy(topicConfig); //check request topic flag @@ -883,6 +933,7 @@ public GetMessageResult getMessage(final String group, final String topic, final return compactionStore.getMessage(group, topic, queueId, offset, maxMsgNums, maxTotalMsgSize); } // else skip + // init context vars long beginTime = this.getSystemClock().now(); GetMessageStatus status = GetMessageStatus.NO_MESSAGE_IN_QUEUE; @@ -900,19 +951,20 @@ public GetMessageResult getMessage(final String group, final String topic, final minOffset = consumeQueue.getMinOffsetInQueue(); maxOffset = consumeQueue.getMaxOffsetInQueue(); - if (maxOffset == 0) { + if (maxOffset == 0) { // empty queue status = GetMessageStatus.NO_MESSAGE_IN_QUEUE; nextBeginOffset = nextOffsetCorrection(offset, 0); - } else if (offset < minOffset) { + } else if (offset < minOffset) { // offset too small status = GetMessageStatus.OFFSET_TOO_SMALL; nextBeginOffset = nextOffsetCorrection(offset, minOffset); - } else if (offset == maxOffset) { + } else if (offset == maxOffset) { // offset overflow one status = GetMessageStatus.OFFSET_OVERFLOW_ONE; nextBeginOffset = nextOffsetCorrection(offset, offset); - } 
else if (offset > maxOffset) { + } else if (offset > maxOffset) { // offset too big status = GetMessageStatus.OFFSET_OVERFLOW_BADLY; nextBeginOffset = nextOffsetCorrection(offset, maxOffset); - } else { + } else { // offset is ok + // init context vars final int maxFilterMessageSize = Math.max(this.messageStoreConfig.getMaxFilterMessageSize(), maxMsgNums * consumeQueue.getUnitSize()); final boolean diskFallRecorded = this.messageStoreConfig.isDiskFallRecorded(); @@ -925,6 +977,14 @@ public GetMessageResult getMessage(final String group, final String topic, final long maxPhyOffsetPulling = 0; int cqFileNum = 0; + /* + * bufferTotalSize is the total message size + * bufferTotalSize less than 0 means + * the while loop will break after getting more than one messages + * + * travelCqFileNumWhenGetMessage limits the max file nums to travel when get message + * default is 1 + */ while (getResult.getBufferTotalSize() <= 0 && nextBeginOffset < maxOffset && cqFileNum++ < this.messageStoreConfig.getTravelCqFileNumWhenGetMessage()) { @@ -944,6 +1004,7 @@ public GetMessageResult getMessage(final String group, final String topic, final long nextPhyFileStartOffset = Long.MIN_VALUE; while (bufferConsumeQueue.hasNext() && nextBeginOffset < maxOffset) { + // init context params and validate CqUnit cqUnit = bufferConsumeQueue.next(); long offsetPy = cqUnit.getPos(); int sizePy = cqUnit.getSize(); @@ -982,6 +1043,7 @@ public GetMessageResult getMessage(final String group, final String topic, final continue; } + // get message, roll to next file if needed SelectMappedBufferResult selectResult = this.commitLog.getMessage(offsetPy, sizePy); if (null == selectResult) { if (getResult.getBufferTotalSize() == 0) { @@ -992,6 +1054,7 @@ public GetMessageResult getMessage(final String group, final String topic, final continue; } + // handle result if (messageStoreConfig.isColdDataFlowControlEnable() && !MixAll.isSysConsumerGroupPullMessage(group) && !selectResult.isInCache()) { 
getResult.setColdDataSum(getResult.getColdDataSum() + sizePy); } @@ -1006,6 +1069,7 @@ public GetMessageResult getMessage(final String group, final String topic, final filterMessageCount++; continue; } + this.storeStatsService.getGetMessageTransferredMsgCount().add(cqUnit.getBatchNum()); getResult.addMessage(selectResult, cqUnit.getQueueOffset(), cqUnit.getBatchNum()); status = GetMessageStatus.FOUND; @@ -1021,6 +1085,7 @@ public GetMessageResult getMessage(final String group, final String topic, final } } + // ... if (diskFallRecorded) { long fallBehind = maxOffsetPy - maxPhyOffsetPulling; brokerStatsManager.recordDiskFallBehindSize(group, topic, queueId, fallBehind); @@ -1036,6 +1101,7 @@ public GetMessageResult getMessage(final String group, final String topic, final nextBeginOffset = nextOffsetCorrection(offset, 0); } + // metrics if (GetMessageStatus.FOUND == status) { this.storeStatsService.getGetMessageTimesTotalFound().add(1); } else { @@ -1050,6 +1116,7 @@ public GetMessageResult getMessage(final String group, final String topic, final long elapsedTime = this.getSystemClock().now() - beginTime; this.storeStatsService.setGetMessageEntireTimeMax(elapsedTime); + // format result // lazy init no data found. 
if (getResult == null) { getResult = new GetMessageResult(0); diff --git a/store/src/main/java/org/apache/rocketmq/store/GetMessageResult.java b/store/src/main/java/org/apache/rocketmq/store/GetMessageResult.java index 6f322a19e19..980f9a7bc89 100644 --- a/store/src/main/java/org/apache/rocketmq/store/GetMessageResult.java +++ b/store/src/main/java/org/apache/rocketmq/store/GetMessageResult.java @@ -21,15 +21,23 @@ import java.util.Collections; import java.util.List; +/** + * result of get message + */ public class GetMessageResult { - + // mappedFile info list private final List messageMapedList; + // message info list in form of ByteBuffer, used by zero copy in version 4.* private final List messageBufferList; + // consume queue offset list private final List messageQueueOffset; private GetMessageStatus status; + // next offset of queue(Consume Queue) private long nextBeginOffset; + // min offset of queue(Consume Queue) private long minOffset; + // max offset of queue(Consume Queue) private long maxOffset; private int bufferTotalSize = 0; diff --git a/store/src/main/java/org/apache/rocketmq/store/MappedFileQueue.java b/store/src/main/java/org/apache/rocketmq/store/MappedFileQueue.java index 94235024da9..afb6e005dc2 100644 --- a/store/src/main/java/org/apache/rocketmq/store/MappedFileQueue.java +++ b/store/src/main/java/org/apache/rocketmq/store/MappedFileQueue.java @@ -686,17 +686,30 @@ public synchronized boolean commit(final int commitLeastPages) { } /** - * Finds a mapped file by offset. + * Locate the mapped file containing the given physical offset. * - * @param offset Offset. - * @param returnFirstOnNotFound If the mapped file is not found, then return the first one. - * @return Mapped file or null (when not found and returnFirstOnNotFound is false). + *

The lookup strategy: + *

    + *
  1. Index-based (O(1)) — computes the index from the offset + * and the first file's offset and accesses the list directly
  2. + *
  3. Iteration (O(n)) — falls back to a linear scan when the + * index-based result does not match the expected range
  4. + *
  5. return first if returnFirstOnNotFound is true
  6. + *
  7. return null if not found
  8. + *
+ * + * @param offset physical offset to find + * @param returnFirstOnNotFound if {@code true}, returns the first mapped + * file when the offset is outside the range + * @return the mapped file, or {@code null} if not found and + * {@code returnFirstOnNotFound} is {@code false} */ public MappedFile findMappedFileByOffset(final long offset, final boolean returnFirstOnNotFound) { try { MappedFile firstMappedFile = this.getFirstMappedFile(); MappedFile lastMappedFile = this.getLastMappedFile(); if (firstMappedFile != null && lastMappedFile != null) { + // offset is not in range of [firstOffset, lastOffset] if (offset < firstMappedFile.getFileFromOffset() || offset >= lastMappedFile.getFileFromOffset() + this.mappedFileSize) { LOG_ERROR.warn("Offset not matched. Request offset: {}, firstOffset: {}, lastOffset: {}, mappedFileSize: {}, mappedFiles count: {}", offset, @@ -705,6 +718,7 @@ public MappedFile findMappedFileByOffset(final long offset, final boolean return this.mappedFileSize, this.mappedFiles.size()); } else { + // get file by index int index = (int) ((offset / this.mappedFileSize) - (firstMappedFile.getFileFromOffset() / this.mappedFileSize)); MappedFile targetFile = null; try { @@ -717,6 +731,7 @@ public MappedFile findMappedFileByOffset(final long offset, final boolean return return targetFile; } + // iterate to find file for (MappedFile tmpMappedFile : this.mappedFiles) { if (offset >= tmpMappedFile.getFileFromOffset() && offset < tmpMappedFile.getFileFromOffset() + this.mappedFileSize) { diff --git a/store/src/main/java/org/apache/rocketmq/store/SelectMappedBufferResult.java b/store/src/main/java/org/apache/rocketmq/store/SelectMappedBufferResult.java index 5c38cfe92a9..b96dfd98882 100644 --- a/store/src/main/java/org/apache/rocketmq/store/SelectMappedBufferResult.java +++ b/store/src/main/java/org/apache/rocketmq/store/SelectMappedBufferResult.java @@ -19,6 +19,12 @@ import java.nio.ByteBuffer; import org.apache.rocketmq.store.logfile.MappedFile; +/** + 
* result while select mapped file + * - mapped file + * - offset and size + * - whether it is in memory + */ public class SelectMappedBufferResult { private final long startOffset; diff --git a/store/src/main/java/org/apache/rocketmq/store/pop/PopCheckPoint.java b/store/src/main/java/org/apache/rocketmq/store/pop/PopCheckPoint.java index 803ebc68957..e4ed5c085e8 100644 --- a/store/src/main/java/org/apache/rocketmq/store/pop/PopCheckPoint.java +++ b/store/src/main/java/org/apache/rocketmq/store/pop/PopCheckPoint.java @@ -21,24 +21,62 @@ import java.util.ArrayList; import java.util.List; +/** + * state check info for multi-messages pop from consume queue + */ public class PopCheckPoint implements Comparable { @JSONField(name = "so") private long startOffset; + /** + * pop time, which is the time when message is popped + * reviveTime = popTime + invisibleTime + */ @JSONField(name = "pt") private long popTime; + /** + * the invisible time of messages + * default is 60s, it can be changed by MQ client + */ @JSONField(name = "it") private long invisibleTime; + /** + * store ack states of messages + * one bit for each message + */ @JSONField(name = "bm") private int bitMap; + /** + * total number of messages + */ @JSONField(name = "n") private byte num; @JSONField(name = "q") private int queueId; @JSONField(name = "t") private String topic; + /** + * consumer group + */ private String cid; + /** + * revive offset, which is the consume queue offset of messageExt + */ @JSONField(name = "ro") private long reviveOffset; + /** + * Per-message offset differences from {@link #startOffset}. + * queueOffsetDiff will not be null or empty in 5.* + * + *

When a batch of messages is popped, the queue offsets of the messages may not + * be contiguous (e.g. batch messages, ConsumeQueue compaction, filter mismatch gaps). + * This list records {@code actualQueueOffset - startOffset} for each message in the + * batch, so that the system can correctly map an ack offset back to its index within + * the checkpoint via {@link #indexOfAck}, and reconstruct the original offset via + * {@link #ackOffsetByIndex}. + * + *

When this field is null or empty (old-version CK), offsets are assumed to be + * {@code startOffset + index}. + */ @JSONField(name = "d") private List queueOffsetDiff; @JSONField(name = "bn") @@ -165,12 +203,23 @@ public void addDiff(int diff) { this.queueOffsetDiff.add(diff); } + /** + * Map an ack offset to its index within the checkpoint batch. + * + *

The index is used to look up the corresponding bit in the {@link #bitMap} + * (or in {@code PopCheckPointWrapper.bits}) and to retrieve the original + * queue offset via {@link #ackOffsetByIndex}. + * + * @param ackOffset the queue offset being acked + * @return the sub-message index (0-based), or -1 if the offset is not found + * in this checkpoint + */ public int indexOfAck(long ackOffset) { if (ackOffset < startOffset) { return -1; } - // old version of checkpoint + // old version of checkpoint, this will not happen in 5.* if (queueOffsetDiff == null || queueOffsetDiff.isEmpty()) { if (ackOffset - startOffset < num) { @@ -184,8 +233,16 @@ public int indexOfAck(long ackOffset) { return queueOffsetDiff.indexOf((int) (ackOffset - startOffset)); } + /** + * Get the original queue offset by index. + * The method name is misleading; a more accurate name would be getQueueOffsetByIndex. + * queueOffset = startOffset + queueOffsetDiff[index] + * + * @param index sub-message index within this checkpoint (0-based) + * @return the original queue offset in the consume queue + */ public long ackOffsetByIndex(byte index) { - // old version of checkpoint + // old version of checkpoint, this will not happen in 5.* if (queueOffsetDiff == null || queueOffsetDiff.isEmpty()) { return startOffset + index; } diff --git a/store/src/main/java/org/apache/rocketmq/store/rocksdb/MessageRocksDBStorage.java b/store/src/main/java/org/apache/rocketmq/store/rocksdb/MessageRocksDBStorage.java index d55596a293c..68844f095e1 100644 --- a/store/src/main/java/org/apache/rocketmq/store/rocksdb/MessageRocksDBStorage.java +++ b/store/src/main/java/org/apache/rocketmq/store/rocksdb/MessageRocksDBStorage.java @@ -57,19 +57,37 @@ import static org.apache.rocketmq.store.timer.rocksdb.TimerRocksDBRecord.TIMER_ROCKSDB_PUT; import static org.apache.rocketmq.store.timer.rocksdb.TimerRocksDBRecord.TIMER_ROCKSDB_UPDATE; +/** + * RocksDB-based storage engine for index, timer, and transaction data. + * + *

Manages three column families: + *

    + *
  • {@code default} — message index for topic-key-time queries
  • + *
  • {@code timer} — delayed/timer messages
  • + *
  • {@code trans} — transactional half-message index
  • + *
+ * The database directory is {@code ${storePathRootDir}/rocksdbstore}. + */ public class MessageRocksDBStorage extends AbstractRocksDBStorage { private static final Logger log = LoggerFactory.getLogger(LoggerName.STORE_LOGGER_NAME); private static final Logger logError = LoggerFactory.getLogger(LoggerName.STORE_ERROR_LOGGER_NAME); private static final String ROCKSDB_MESSAGE_DIRECTORY = "rocksdbstore"; + // Column family identifiers public static final byte[] TIMER_COLUMN_FAMILY = "timer".getBytes(StandardCharsets.UTF_8); public static final byte[] TRANS_COLUMN_FAMILY = "trans".getBytes(StandardCharsets.UTF_8); + + // Metadata keys stored inside each column family private static final byte[] LAST_OFFSET_PY = "lastOffsetPy".getBytes(StandardCharsets.UTF_8); private static final byte[] LAST_STORE_TIMESTAMP = "lastStoreTimeStamp".getBytes(StandardCharsets.UTF_8); + + // Suffix filled with 0xFF for range-delete upper bound private static final byte[] END_SUFFIX_BYTES = new byte[512]; static { Arrays.fill(END_SUFFIX_BYTES, (byte) 0xFF); } + + // Allowed checkpoint keys for the timer column family private static final Set COMMON_CHECK_POINT_KEY_SET_FOR_TIMER = new HashSet<>(); public static final byte[] SYS_TOPIC_SCAN_OFFSET_CHECK_POINT = "sys_topic_scan_offset_checkpoint".getBytes(StandardCharsets.UTF_8); public static final byte[] TIMELINE_CHECK_POINT = "timeline_checkpoint".getBytes(StandardCharsets.UTF_8); @@ -77,13 +95,18 @@ public class MessageRocksDBStorage extends AbstractRocksDBStorage { COMMON_CHECK_POINT_KEY_SET_FOR_TIMER.add(SYS_TOPIC_SCAN_OFFSET_CHECK_POINT); COMMON_CHECK_POINT_KEY_SET_FOR_TIMER.add(TIMELINE_CHECK_POINT); } + private static final byte[] DELETE_VAL_FLAG = new byte[] {(byte)0xFF}; private static final int LAST_OFFSET_PY_LENGTH = LAST_OFFSET_PY.length; private volatile ColumnFamilyHandle timerCFHandle; private volatile ColumnFamilyHandle transCFHandle; + // Periodically flush the timer WAL to keep recovery time bounded private final 
ScheduledExecutorService scheduler = Executors.newScheduledThreadPool(1); + + // Cache of recently deleted timer keys to prevent stale writes on update + // after a delete has been issued but not yet flushed private static final Cache DELETE_KEY_CACHE_FOR_TIMER = CacheBuilder.newBuilder() .maximumSize(10000) .expireAfterWrite(60, TimeUnit.MINUTES) @@ -99,6 +122,7 @@ protected boolean postLoad() { try { UtilAll.ensureDirOK(this.dbPath); initOptions(); + // Per-column-family options for different access patterns ColumnFamilyOptions indexCFOptions = RocksDBOptionsFactory.createIndexCFOptions(); ColumnFamilyOptions timerCFOptions = RocksDBOptionsFactory.createTimerCFOptions(); ColumnFamilyOptions transCFOptions = RocksDBOptionsFactory.createTransCFOptions(); @@ -114,6 +138,7 @@ protected boolean postLoad() { this.defaultCFHandle = cfHandles.get(0); this.timerCFHandle = cfHandles.get(1); this.transCFHandle = cfHandles.get(2); + // Periodically flush the timer WAL to cap recovery time scheduler.scheduleAtFixedRate(() -> { try { db.flush(flushOptions, timerCFHandle); @@ -145,6 +170,16 @@ protected void preShutdown() { log.info("MessageRocksDBStorage pre shutdown success, dbPath: {}", this.dbPath); } + /** + * Query physical offsets by topic, index type, key, and time range. + * + *

The index key is structured as {@code hour | KEY_SPLIT | topic | KEY_SPLIT | + * indexType | KEY_SPLIT | key | KEY_SPLIT | offsetPy}. + * Iterates hour-by-hour within {@code [beginTime, endTime]}, extracting + * the trailing offset bytes from each matching key. + * + * @param lastKey cursor for pagination, format "hour|topic|indexType|key|offsetPy" + */ public List queryOffsetForIndex(byte[] columnFamily, String topic, String indexType, String key, long beginTime, long endTime, int maxNum, String lastKey) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || StringUtils.isEmpty(topic) || StringUtils.isEmpty(indexType) || StringUtils.isEmpty(key) || beginTime < 0L || endTime <= 0L || beginTime > endTime || maxNum <= 0) { @@ -261,6 +296,10 @@ public void deleteRecordsForIndex(byte[] columnFamily, long storeTime, int hours } } + /** + * Batch-write index records and update the lastOffsetPy / lastStoreTimestamp + * metadata if the last record in the batch advances them. + */ public void writeRecordsForIndex(byte[] columnFamily, List recordList) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || CollectionUtils.isEmpty(recordList)) { @@ -332,6 +371,13 @@ private static Long getLastIndexTimeForIndex(String lastKey) { return null; } + /** + * Batch-write timer records supporting put, delete, and conditional update. + * + *

For {@code TIMER_ROCKSDB_UPDATE}, the write is skipped if the key was + * recently deleted (tracked in {@link #DELETE_KEY_CACHE_FOR_TIMER}) to + * avoid resurrecting a stale timer entry. + */ public void writeRecordsForTimer(byte[] columnFamily, List recordList) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || CollectionUtils.isEmpty(recordList)) { @@ -377,6 +423,12 @@ private static String getTimerCacheKey(long delayTime, String uniqKey) { return delayTime + ":" + uniqKey; } + /** + * Scan timer records in a time range {@code [lowerTime, upperTime)} using + * RocksDB iterate lower/upper bound for efficient range scan. + * + * @param startKey resume cursor from a previous scan; if null, scan from lowerTime + */ public List scanRecordsForTimer(byte[] columnFamily, long lowerTime, long upperTime, int size, byte[] startKey) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || lowerTime <= 0L || upperTime <= 0L || lowerTime > upperTime || size <= 0) { @@ -482,6 +534,14 @@ public void deleteCheckPointForTimer(byte[] columnFamily, byte[] key) { } } + /** + * Batch-write or delete transaction records. + * + *

Half-message records are put and track the maximum offsetPy for + * {@code LAST_OFFSET_PY}. OP (commit/rollback) records are deleted from + * the column family. The metadata key {@code LAST_OFFSET_PY} is updated + * atomically in the same write batch. + */ public void writeRecordsForTrans(byte[] columnFamily, List recordList) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || CollectionUtils.isEmpty(recordList)) { @@ -490,6 +550,7 @@ public void writeRecordsForTrans(byte[] columnFamily, List r long lastOffsetPy = 0L; try (WriteBatch writeBatch = new WriteBatch()) { for (TransRocksDBRecord record : recordList) { + // validate record if (null == record) { logError.error("MessageRocksDBStorage writeRecordsForTrans error, record is null"); continue; @@ -499,9 +560,10 @@ public void writeRecordsForTrans(byte[] columnFamily, List r logError.error("MessageRocksDBStorage writeRecordsForTrans param error, keyBytes: {}", keyBytes); continue; } - if (record.isOp()) { + + if (record.isOp()) { // delete if commit/rollback message writeBatch.delete(cfHandle, record.getKeyBytes()); - } else { + } else { // put if prepare message byte[] valueBytes = record.getValueBytes(); if (null == valueBytes || valueBytes.length == 0) { logError.error("MessageRocksDBStorage writeRecordsForTrans param error, valueBytes: {}", valueBytes); @@ -511,18 +573,25 @@ public void writeRecordsForTrans(byte[] columnFamily, List r lastOffsetPy = Math.max(lastOffsetPy, record.getOffsetPy()); } } + + // update last offsetPy if (lastOffsetPy > 0L) { Long lastOffsetPyStore = getLastOffsetPy(columnFamily); if (null == lastOffsetPyStore || lastOffsetPy > lastOffsetPyStore) { writeBatch.put(cfHandle, LAST_OFFSET_PY, ByteBuffer.allocate(Long.BYTES).putLong(lastOffsetPy).array()); } } + batchPut(ableWalWriteOptions, writeBatch); } catch (Exception e) { logError.error("MessageRocksDBStorage writeRecordsForTrans error: {}", e.getMessage()); } } + /** + * Update existing transaction 
records (e.g. increment check count). + * Entry deletion is also supported via {@link TransRocksDBRecord#isDelete()}. + */ public void updateRecordsForTrans(byte[] columnFamily, List recordList) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || CollectionUtils.isEmpty(recordList)) { @@ -552,6 +621,12 @@ public void updateRecordsForTrans(byte[] columnFamily, List } } + /** + * Scan transaction records from the column family. Skips the + * {@code LAST_OFFSET_PY} metadata key. + * + * @param startKey resume cursor; if null, scan from the first entry + */ public List scanRecordsForTrans(byte[] columnFamily, int size, byte[] startKey) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || size <= 0) { @@ -559,6 +634,7 @@ public List scanRecordsForTrans(byte[] columnFamily, int siz } RocksIterator iterator = null; try { + // init iterator iterator = db.newIterator(cfHandle); if (null == startKey || startKey.length == 0) { iterator.seekToFirst(); @@ -596,6 +672,10 @@ public List scanRecordsForTrans(byte[] columnFamily, int siz return null; } + /** + * Look up a single transaction record by key. Returns null if not found + * or if the value length does not match {@link TransRocksDBRecord#VALUE_LENGTH}. 
+ */ public TransRocksDBRecord getRecordForTrans(byte[] columnFamily, TransRocksDBRecord transRocksDBRecord) { ColumnFamilyHandle cfHandle = getColumnFamily(columnFamily); if (null == cfHandle || null == transRocksDBRecord) { diff --git a/store/src/main/java/org/apache/rocketmq/store/timer/TimerMessageStore.java b/store/src/main/java/org/apache/rocketmq/store/timer/TimerMessageStore.java index 157f237f7b2..e84257766d4 100644 --- a/store/src/main/java/org/apache/rocketmq/store/timer/TimerMessageStore.java +++ b/store/src/main/java/org/apache/rocketmq/store/timer/TimerMessageStore.java @@ -1887,6 +1887,22 @@ public long getCongestNum(long deliverTimeMs) { return timerWheel.getNum(deliverTimeMs); } + /** + * Reject timer messages when the target time slot is congested. + * always return false with default config + * + *

Three tiers of flow control based on the number of messages already + * scheduled for the given delivery time: + *

    + *
  • ≤ {@code timerCongestNumEachSlot} — always admit
  • + *
between 1x and 2x — probabilistically reject with a linearly + * increasing probability
  • + *
  • ≥ 2x — always reject
  • + *
+ * + * @param deliverTimeMs the target delivery timestamp + * @return {@code true} if the message should be rejected + */ public boolean isReject(long deliverTimeMs) { long congestNum = timerWheel.getNum(deliverTimeMs); if (congestNum <= storeConfig.getTimerCongestNumEachSlot()) { diff --git a/store/src/main/java/org/apache/rocketmq/store/transaction/TransMessageRocksDBStore.java b/store/src/main/java/org/apache/rocketmq/store/transaction/TransMessageRocksDBStore.java index 4166f2a3077..6816e93dfad 100644 --- a/store/src/main/java/org/apache/rocketmq/store/transaction/TransMessageRocksDBStore.java +++ b/store/src/main/java/org/apache/rocketmq/store/transaction/TransMessageRocksDBStore.java @@ -48,6 +48,20 @@ import org.rocksdb.RocksDBException; import static org.apache.rocketmq.store.rocksdb.MessageRocksDBStorage.TRANS_COLUMN_FAMILY; +/** + * RocksDB-based transactional half-message index store. + * + *

Replaces the CommitLog-based HALF/OP Topic approach with a RocksDB + * column family ({@link MessageRocksDBStorage#TRANS_COLUMN_FAMILY}). When + * a half or OP message is dispatched from the CommitLog, this store builds + * an index entry (or tombstone) in RocksDB, enabling O(1) state lookups + * during transaction check-back instead of scanning HALF/OP queues. + * + *

Index building is asynchronous: {@link #buildTransIndex} enqueues + * {@link TransRocksDBRecord}s into a bounded blocking queue, and a + * background {@link TransIndexBuildService} batches them into RocksDB + * writes. + */ public class TransMessageRocksDBStore implements CommitLogDispatchStore { private static final Logger log = LoggerFactory.getLogger(LoggerName.STORE_LOGGER_NAME); private static final Logger logError = LoggerFactory.getLogger(LoggerName.STORE_ERROR_LOGGER_NAME); @@ -64,8 +78,10 @@ public class TransMessageRocksDBStore implements CommitLogDispatchStore { private final MessageRocksDBStorage messageRocksDBStorage; private final BrokerStatsManager brokerStatsManager; private final SocketAddress storeHost; + // Thread-local buffer for reading messages from CommitLog, grown on demand private ThreadLocal bufferLocal = null; private TransIndexBuildService transIndexBuildService; + // Bounded queue decoupling CommitLog dispatch from RocksDB batch writes protected BlockingQueue originTransMsgQueue; public TransMessageRocksDBStore(final MessageStore messageStore, final BrokerStatsManager brokerStatsManager, final SocketAddress storeHost) { @@ -103,7 +119,20 @@ public void shutdown() { log.info("TransMessageRocksDBStore shutdown success"); } + /** + * CommitLog dispatch hook: build a transaction index record for half or OP messages. + * + *

Skips records whose CommitLog offset is already covered by the RocksDB + * {@code lastOffsetPy} watermark (idempotent on re-dispatch). + * + *

For {@code RMQ_SYS_ROCKSDB_TRANS_HALF_TOPIC}: creates a half-message + * record (offsetPy, topic, uniqKey, size, checkTimes=0). + * For {@code RMQ_SYS_ROCKSDB_TRANS_OP_HALF_TOPIC}: creates a tombstone + * record referencing the half-message's offset via + * {@code PROPERTY_TRANS_OFFSET}. + */ public void buildTransIndex(DispatchRequest dispatchRequest) { + // validate request and init context params if (null == dispatchRequest || dispatchRequest.getCommitLogOffset() < 0L || dispatchRequest.getMsgSize() <= 0 || state != RUNNING || null == this.originTransMsgQueue) { logError.error("TransMessageRocksDBStore buildTransIndex error, dispatchRequest: {}, state: {}, originTransMsgQueue: {}", dispatchRequest, state, originTransMsgQueue); return; @@ -119,6 +148,8 @@ public void buildTransIndex(DispatchRequest dispatchRequest) { int reqMsgSize = dispatchRequest.getMsgSize(); try { MessageExt msgInner = getMessage(reqOffsetPy, reqMsgSize); + + // parse and validate msgInner if (null == msgInner) { logError.error("TransMessageRocksDBStore buildTransIndex error, msgInner is not found, reqOffsetPy: {}, reqMsgSize: {}", reqOffsetPy, reqMsgSize); return; @@ -131,6 +162,7 @@ public void buildTransIndex(DispatchRequest dispatchRequest) { } TransRocksDBRecord transRocksDBRecord = null; String reqTopic = dispatchRequest.getTopic(); + if (TopicValidator.RMQ_SYS_ROCKSDB_TRANS_HALF_TOPIC.equals(reqTopic)) { transRocksDBRecord = new TransRocksDBRecord(reqOffsetPy, topic, uniqKey, reqMsgSize, 0); } else if (TopicValidator.RMQ_SYS_ROCKSDB_TRANS_OP_HALF_TOPIC.equals(reqTopic)) { @@ -148,6 +180,7 @@ public void buildTransIndex(DispatchRequest dispatchRequest) { logError.error("TransMessageRocksDBStore buildTransIndex error, transOffsetPy: {}, error: {}", transOffsetPy, e.getMessage()); } } + if (null != transRocksDBRecord) { while (!originTransMsgQueue.offer(transRocksDBRecord, 3, TimeUnit.SECONDS)) { if (System.currentTimeMillis() % 1000 == 0) { @@ -160,6 +193,13 @@ public void 
buildTransIndex(DispatchRequest dispatchRequest) { } } + /** + * Persist an OP (commit/rollback) message to CommitLog. The message is + * written to {@code RMQ_SYS_ROCKSDB_TRANS_OP_HALF_TOPIC} with + * {@code PROPERTY_TRANS_OFFSET} pointing back to the half message's + * commitLog offset. When the CommitLog dispatcher processes it, + * {@link #buildTransIndex} converts it into a RocksDB tombstone delete. + */ public void deletePrepareMessage(MessageExt messageExt) { if (null == messageExt) { logError.error("TransMessageRocksDBStore deletePrepareMessage error, messageExt is null"); @@ -214,6 +254,13 @@ public MessageRocksDBStorage getMessageRocksDBStorage() { return messageRocksDBStorage; } + /** + * Build an OP message for the RocksDB trans OP half topic. + * + *

Body is a single fill byte (actual data is in properties). + * {@code PROPERTY_TRANS_OFFSET} carries the half message's commitLog + * offset so {@link #buildTransIndex} can issue a RocksDB delete. + */ private MessageExtBrokerInner makeOpMessageInner(MessageExt messageExt) { if (null == messageExt) { logError.error("TransMessageRocksDBStore makeOpMessageInner messageExt is null"); @@ -246,6 +293,11 @@ private MessageExtBrokerInner makeOpMessageInner(MessageExt messageExt) { } } + /** + * Look up how many times a half message has been checked by the + * transaction checker. Returns null if the record does not exist + * (e.g. already committed/rolled back or never indexed). + */ public Integer getCheckTimes(String topic, String uniqKey, Long offsetPy) { if (StringUtils.isEmpty(topic) || StringUtils.isEmpty(uniqKey) || null == offsetPy || offsetPy < 0L) { return null; @@ -262,6 +314,13 @@ public Integer getCheckTimes(String topic, String uniqKey, Long offsetPy) { } } + /** + * Called during CommitLog recovery to decide whether a mapped file's + * data has already been indexed in RocksDB. + * + * @return true if the file's phyOffset is covered by the trans column + * family's lastOffsetPy watermark, meaning no re-dispatch needed. + */ public boolean isMappedFileMatchedRecover(long phyOffset, long storeTimestamp, boolean recoverNormally) throws RocksDBException { if (!storeConfig.isTransRocksDBEnable()) { @@ -287,6 +346,14 @@ private String getServiceThreadName() { return brokerIdentifier; } + /** + * Background service that drains {@link #originTransMsgQueue} in batches + * and writes them to the trans column family via + * {@link MessageRocksDBStorage#writeRecordsForTrans}. + * + *

Polls up to {@link #BATCH_SIZE} records per iteration. Continues + * draining even after shutdown is requested to avoid data loss. + */ public class TransIndexBuildService extends ServiceThread { private final Logger log = TransMessageRocksDBStore.log; private List trs; @@ -345,6 +412,11 @@ protected void pollTransMessageRecords() { } } + /** + * Returns the starting CommitLog phy offset from which dispatch should + * resume after a restart. Reads the {@code lastOffsetPy} watermark from + * the trans column family. + */ @Override public Long getDispatchFromPhyOffset(boolean recoverNormally) throws RocksDBException { if (!storeConfig.isTransRocksDBEnable()) { diff --git a/store/src/main/java/org/apache/rocketmq/store/transaction/TransRocksDBRecord.java b/store/src/main/java/org/apache/rocketmq/store/transaction/TransRocksDBRecord.java index 099f6150939..2f85449a332 100644 --- a/store/src/main/java/org/apache/rocketmq/store/transaction/TransRocksDBRecord.java +++ b/store/src/main/java/org/apache/rocketmq/store/transaction/TransRocksDBRecord.java @@ -23,19 +23,52 @@ import org.apache.rocketmq.logging.org.slf4j.Logger; import org.apache.rocketmq.logging.org.slf4j.LoggerFactory; +/** + * Represents a single transaction record stored in the RocksDB trans column family. + * + *

Key format: {@code [offsetPy (8 bytes, binary long)] + "@" + topic + "@" + uniqKey}
+ * Value format: {@code [checkTimes (int) | sizePy (int)]} — 8 bytes total + * + *

Two record types share this structure: + *

    + *
  • Half record ({@code isOp=false}): persisted as a RocksDB put with + * the full value. Used to track a pending transaction for status checking.
  • + *
  • OP record ({@code isOp=true}): persisted as a RocksDB delete + * (tombstone). Created when a commit/rollback OP message is dispatched, + * erasing the corresponding half record from the column family.
  • + *
+ */ public class TransRocksDBRecord { private static final Logger logError = LoggerFactory.getLogger(LoggerName.STORE_ERROR_LOGGER_NAME); public static final int VALUE_LENGTH = Integer.BYTES + Integer.BYTES; private static final String KEY_SPLIT = "@"; + + // CommitLog physical offset prefix — primary sort key for RocksDB ordering protected long offsetPy; + // Real business topic of the transaction private String topic; + // Unique transaction ID (UniqID / transactionId) private String uniqKey; + // How many times the transaction checker has probed this record private int checkTimes = 0; + // Message body size in CommitLog, used to read the original message on check private int sizePy; + // True if this record came from an OP message (commit/rollback tombstone) private boolean isOp; + // True if the record should be deleted after exceeding max check times private boolean delete; + // Transient holder for the deserialized MessageExt, not persisted private MessageExt messageExt; + /** + * Create a half record for initial indexing from CommitLog dispatch. + * + * @param offsetPy CommitLog phy offset + * @param topic original business topic + * @param uniqKey transaction ID + * @param sizePy message body size in CommitLog + * @param checkTimes initial check counter (usually 0) + */ public TransRocksDBRecord(long offsetPy, String topic, String uniqKey, int sizePy, int checkTimes) { this.offsetPy = offsetPy; this.topic = topic; @@ -44,6 +77,12 @@ public TransRocksDBRecord(long offsetPy, String topic, String uniqKey, int sizeP this.checkTimes = checkTimes; } + /** + * Create an OP record referencing a half record by its offsetPy. + * + *

When {@code isOp=true}, this record signals a RocksDB delete + * (tombstone) — no value is persisted. + */ public TransRocksDBRecord(long offsetPy, String topic, String uniqKey, boolean isOp) { this.offsetPy = offsetPy; this.topic = topic; @@ -53,6 +92,12 @@ public TransRocksDBRecord(long offsetPy, String topic, String uniqKey, boolean i public TransRocksDBRecord() {} + /** + * get rocksDB record key: + * offsetPy + KEY_SPLIT + topic + KEY_SPLIT + uniqKey(transactionId) + * + * @return get key bytes + */ public byte[] getKeyBytes() { if (offsetPy < 0L || StringUtils.isEmpty(topic) || StringUtils.isEmpty(uniqKey)) { return null; @@ -62,6 +107,12 @@ public byte[] getKeyBytes() { return ByteBuffer.allocate(keyLength).putLong(offsetPy).put(keySuffixBytes).array(); } + /** + * Serialize value as {@code [checkTimes (4 bytes) | sizePy (4 bytes)]}. + * Total encoded length is {@link #VALUE_LENGTH} (8 bytes). + * + * @return encoded value bytes, or null if checkTimes or sizePy is invalid + */ public byte[] getValueBytes() { if (checkTimes < 0 || sizePy <= 0) { logError.error("TransRocksDBRecord getValueBytes error, checkTimes: {}, sizePy: {}", checkTimes, sizePy); @@ -70,6 +121,12 @@ public byte[] getValueBytes() { return ByteBuffer.allocate(VALUE_LENGTH).putInt(checkTimes).putInt(sizePy).array(); } + /** + * Deserialize a record from its RocksDB key-value pair. + * + *

Key layout: {@code [offsetPy (8 bytes)][suffix ("@" + topic + "@" + uniqKey)]}
+ * Value layout: {@code [checkTimes (4 bytes) | sizePy (4 bytes)]} + */ public static TransRocksDBRecord decode(byte[] key, byte[] value) { if (null == key || key.length <= Long.BYTES || null == value || value.length != VALUE_LENGTH) { logError.error("TransRocksDBRecord decode param error, key: {}, value: {}", key, value);