Skip to content

Commit e292aa9

Browse files
joshkang97 and meta-codesync[bot]
authored and committed
Allow skipping lmax index and filter block prefetches for external file ingestion
Summary:
Bulk external file ingestion can spend significant commit time prefetching index/filter blocks for large Lmax files. This adds `IngestExternalFileOptions::prefetch_lmax_index_and_filter_blocks` (default true) so bulk-load callers can defer that metadata prefetch when the cache doesn't need immediate warming. The option threads through `FileMetaData::skip_index_and_filter_blocks_prefetch` to drive the existing `LoadTableHandlers()` prefetch logic. Benchmarks show ~40% commit latency reduction (3459 → 2112 μs) and ~55% lower IO time (2560 → 1145 μs) when disabled.

Benchmarks:
```
db_bench --benchmarks=ingestexternalfile --num=2200000 \
  --ingest_external_file_num_batches=1 \
  --ingest_external_file_batch_size=1 \
  --ingest_external_file_use_file_info=true \
  --ingest_external_file_fill_cache=true \
  --cache_index_and_filter_blocks=true \
  --bloom_bits=10 --statistics=true --stats_level=3 \
  --ingest_external_file_prefetch_lmax_index_and_filter_blocks=<true|false>
```

| `ingest_external_file_prefetch_lmax_index_and_filter_blocks` | `rocksdb.ingest.external.file.run.micros` | `rocksdb.table.open.io.micros` | index/filter cache adds | last-level read bytes |
| --- | --- | --- | --- | --- |
| `true` | `3459 us` | `2560 us` | `1 / 1` | `3551559` |
| `false` | `2112 us` | `1145 us` | `0 / 0` | `1333` |

Reviewed By: xingbowang

Differential Revision: D108678511

fbshipit-source-id: e8951e3aad36cc0accffb33ecc3cc1b4aeb89459
1 parent 123030f commit e292aa9

9 files changed

Lines changed: 116 additions & 1 deletion

File tree

‎db/external_sst_file_ingestion_job.cc‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ Status ExternalSstFileIngestionJob::Prepare(
4444
IngestedFileInfo file_to_ingest;
4545
// For temperature, first assume it matches provided hint
4646
file_to_ingest.file_temperature = file_temperature;
47+
file_to_ingest.prefetch_lmax_index_and_filter_blocks =
48+
ingestion_options_.prefetch_lmax_index_and_filter_blocks;
4749
const PreparedFileInfo* prepared_file_info =
4850
file_infos.empty() ? nullptr : file_infos[i];
4951
status = GetIngestedFileInfo(file_path, next_file_number++,
@@ -748,6 +750,9 @@ Status ExternalSstFileIngestionJob::AssignLevelsForOneBatch(
748750
tail_size, file->user_defined_timestamps_persisted, "", "");
749751
f_metadata.temperature = file->file_temperature;
750752
f_metadata.marked_for_compaction = marked_for_compaction;
753+
f_metadata.skip_index_and_filter_blocks_prefetch =
754+
!file->prefetch_lmax_index_and_filter_blocks &&
755+
file->picked_level == cfd_->NumberLevels() - 1;
751756
// Extract min/max timestamps from table properties for UDT support.
752757
// This ensures ingested files have proper timestamp ranges in FileMetaData,
753758
// similar to files created by flush and compaction.

‎db/external_sst_file_ingestion_job.h‎

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,9 @@ struct IngestedFileInfo : public KeyRangeInfo {
181181
// setting.
182182
bool user_defined_timestamps_persisted = true;
183183

184+
// Whether Lmax commit-time table opening should prefetch index/filter blocks.
185+
bool prefetch_lmax_index_and_filter_blocks = true;
186+
184187
SequenceNumber largest_seqno = kMaxSequenceNumber;
185188
SequenceNumber smallest_seqno = kMaxSequenceNumber;
186189
};

‎db/external_sst_file_test.cc‎

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -465,6 +465,85 @@ TEST_F(ExternalSSTFileTest, ParallelFileOpenWithFileOpeningThreads) {
465465
}
466466
}
467467

468+
TEST_F(ExternalSSTFileTest, LmaxPrefetchSkipDoesNotDisableL0Prefetch) {
469+
LRUCacheOptions co;
470+
co.capacity = 32 << 20;
471+
std::shared_ptr<Cache> cache = NewLRUCache(co);
472+
BlockBasedTableOptions table_options;
473+
table_options.block_cache = cache;
474+
table_options.cache_index_and_filter_blocks = true;
475+
table_options.filter_policy.reset(NewBloomFilterPolicy(10));
476+
477+
Options options = CurrentOptions();
478+
options.disable_auto_compactions = true;
479+
options.max_open_files = -1;
480+
options.num_levels = 2;
481+
options.optimize_filters_for_hits = false;
482+
options.statistics = CreateDBStatistics();
483+
options.table_factory.reset(NewBlockBasedTableFactory(table_options));
484+
DestroyAndReopen(options);
485+
486+
const auto write_file = [&](const std::string& file_path,
487+
const std::string& value,
488+
ExternalSstFileInfo* file_info) {
489+
SstFileWriter writer(EnvOptions(), options);
490+
ASSERT_OK(writer.Open(file_path));
491+
ASSERT_OK(writer.Put(Key(10), value));
492+
ASSERT_OK(writer.Finish(file_info));
493+
};
494+
495+
const auto ingest_file = [&](const std::string& file_path,
496+
const ExternalSstFileInfo& file_info,
497+
bool fail_if_not_bottommost_level) {
498+
IngestExternalFileArg arg;
499+
arg.column_family = db_->DefaultColumnFamily();
500+
arg.external_files = {file_path};
501+
arg.file_infos = {file_info.prepared_file_info.get()};
502+
arg.options.fail_if_not_bottommost_level = fail_if_not_bottommost_level;
503+
arg.options.verify_checksums_before_ingest = false;
504+
arg.options.prefetch_lmax_index_and_filter_blocks = false;
505+
ASSERT_OK(db_->IngestExternalFiles({arg}));
506+
};
507+
508+
const std::string lmax_file_path = sst_files_dir_ + "lazy_lmax.sst";
509+
ExternalSstFileInfo lmax_file_info;
510+
write_file(lmax_file_path, "v10", &lmax_file_info);
511+
512+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
513+
Tickers::BLOCK_CACHE_INDEX_ADD));
514+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
515+
Tickers::BLOCK_CACHE_FILTER_ADD));
516+
517+
ingest_file(lmax_file_path, lmax_file_info,
518+
true /* fail_if_not_bottommost_level */);
519+
ASSERT_EQ("0,1", FilesPerLevel());
520+
521+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
522+
Tickers::BLOCK_CACHE_INDEX_ADD));
523+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
524+
Tickers::BLOCK_CACHE_FILTER_ADD));
525+
526+
const std::string l0_file_path = sst_files_dir_ + "l0.sst";
527+
ExternalSstFileInfo l0_file_info;
528+
write_file(l0_file_path, "v11", &l0_file_info);
529+
530+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
531+
Tickers::BLOCK_CACHE_INDEX_ADD));
532+
ASSERT_EQ(0, options.statistics->getAndResetTickerCount(
533+
Tickers::BLOCK_CACHE_FILTER_ADD));
534+
535+
ingest_file(l0_file_path, l0_file_info,
536+
false /* fail_if_not_bottommost_level */);
537+
ASSERT_EQ("1,1", FilesPerLevel());
538+
539+
EXPECT_GT(options.statistics->getAndResetTickerCount(
540+
Tickers::BLOCK_CACHE_INDEX_ADD),
541+
0);
542+
EXPECT_GT(options.statistics->getAndResetTickerCount(
543+
Tickers::BLOCK_CACHE_FILTER_ADD),
544+
0);
545+
}
546+
468547
TEST_F(ExternalSSTFileTest, AbortPreparedIngestion) {
469548
Options options = CurrentOptions();
470549
DestroyAndReopen(options);

‎db/version_edit.h‎

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -337,6 +337,11 @@ struct FileMetaData {
337337
// via FileOptions::file_metadata on subsequent opens. Empty if not available.
338338
std::string file_open_metadata;
339339

340+
// Skips prefetching index and filter blocks into block cache on file open,
341+
// not persisted in MANIFEST. NOTE: false does not guarantee prefetching
342+
// either (e.g. when index and filter blocks are not cached in block cache).
343+
bool skip_index_and_filter_blocks_prefetch = false;
344+
340345
FileMetaData() = default;
341346

342347
FileMetaData(uint64_t file, uint32_t file_path_id, uint64_t file_size,

‎db/version_util.cc‎

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -62,11 +62,14 @@ Status LoadTableHandlersHelper(
6262

6363
TableCache::TypedHandle* handle = nullptr;
6464
TableReader* table_reader = nullptr;
65+
const bool prefetch_index_and_filter_for_file =
66+
prefetch_index_and_filter_in_cache &&
67+
!file_meta->skip_index_and_filter_blocks_prefetch;
6568
auto status = table_cache->FindTable(
6669
read_options, file_options, internal_comparator, *file_meta, &handle,
6770
mutable_cf_options, &table_reader, false /* no_io */,
6871
internal_stats->GetFileReadHist(level), false /* skip_filters */,
69-
level, prefetch_index_and_filter_in_cache,
72+
level, prefetch_index_and_filter_for_file,
7073
max_file_size_for_l0_meta_pin, file_meta->temperature,
7174
true /* pin_table_handle */);
7275

‎db_stress_tool/no_batched_ops_stress.cc‎

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2511,6 +2511,8 @@ class NonBatchedOpsStressTest : public StressTest {
25112511
thread->rand.OneInOpt(2) ? 1024 * 1024 : 0;
25122512
ingest_options.fill_cache = thread->rand.OneInOpt(4);
25132513
ingest_options.file_opening_threads = 1 + thread->rand.Uniform(4);
2514+
ingest_options.prefetch_lmax_index_and_filter_blocks =
2515+
!thread->rand.OneInOpt(4);
25142516
const bool use_prepare_commit = thread->rand.OneInOpt(
25152517
FLAGS_ingest_external_file_prepare_commit_one_in);
25162518
const bool use_separate_prepare_calls = use_prepare_commit &&
@@ -2524,6 +2526,8 @@ class NonBatchedOpsStressTest : public StressTest {
25242526
<< ", fill_cache: " << ingest_options.fill_cache
25252527
<< ", file_opening_threads: "
25262528
<< ingest_options.file_opening_threads
2529+
<< ", prefetch_lmax_index_and_filter_blocks: "
2530+
<< ingest_options.prefetch_lmax_index_and_filter_blocks
25272531
<< ", ingest_external_file_data_file_count: "
25282532
<< data_file_count
25292533
<< ", num_external_files: " << external_files.size()

‎include/rocksdb/options.h‎

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2952,6 +2952,13 @@ struct IngestExternalFileOptions {
29522952
// ingestion options.
29532953
bool fill_cache = true;
29542954

2955+
// Controls whether file ingestion into Lmax should prefetch index and filter
2956+
// blocks while opening table readers during commit. Setting this to false can
2957+
// reduce commit latency for bulk loads into Lmax when
2958+
// (BlockBasedTableOptions::cache_index_and_filter_blocks=true or partitioned
2959+
// filters/indexes are enabled).
2960+
bool prefetch_lmax_index_and_filter_blocks = true;
2961+
29552962
// Maximum number of threads used to open table readers for the files being
29562963
// ingested during commit, can speed up ingestion performance, when ingesting
29572964
// multiple files at once.

‎tools/db_bench_tool.cc‎

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1352,6 +1352,12 @@ DEFINE_bool(ingest_external_file_fill_cache,
13521352
"If true, the ingestexternalfile benchmark allows file ingestion "
13531353
"reads to populate block cache.");
13541354

1355+
DEFINE_bool(ingest_external_file_prefetch_lmax_index_and_filter_blocks,
1356+
ROCKSDB_NAMESPACE::IngestExternalFileOptions()
1357+
.prefetch_lmax_index_and_filter_blocks,
1358+
"If true, the ingestexternalfile benchmark prefetches index and "
1359+
"filter blocks while opening table readers during commit.");
1360+
13551361
DEFINE_uint64(
13561362
initial_auto_readahead_size,
13571363
ROCKSDB_NAMESPACE::BlockBasedTableOptions().initial_auto_readahead_size,
@@ -9710,6 +9716,8 @@ class Benchmark {
97109716
ingest_options.move_files = true;
97119717
ingest_options.file_opening_threads = file_opening_threads;
97129718
ingest_options.fill_cache = FLAGS_ingest_external_file_fill_cache;
9719+
ingest_options.prefetch_lmax_index_and_filter_blocks =
9720+
FLAGS_ingest_external_file_prefetch_lmax_index_and_filter_blocks;
97139721
if (use_file_info) {
97149722
// Reuse the writer's metadata so ingestion skips re-opening/scanning.
97159723
IngestExternalFileArg arg;
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Added `IngestExternalFileOptions::prefetch_lmax_index_and_filter_blocks` (default true). Setting it to false skips commit-time index and filter block prefetching for files ingested into the last level, which can reduce commit latency for large external file ingestions when table metadata is cache-backed.

0 commit comments

Comments
 (0)