Skip to content

Commit 214869a

Browse files
xingbowangmeta-codesync[bot]
authored andcommitted
Persist fault injection logs and fail fast on expected-state trace writes (#14651)
Summary: - switch fault injection error recording from an in-memory ring buffer to per-run fixed-record binary logs under `TEST_TMPDIR/fault_injection_logs` (or `/tmp/fault_injection_logs`) so crash paths survive DB reopen cleanup - keep the raw and decoded fault logs for external artifact collection/cleanup, and make `db_crashtest` print consistent blackbox/whitebox summaries after decoding - make expected-state tracing fail fast on trace write failures and document offline trace inspection via `trace_analyzer` - add coverage for binary log persistence/decoding/truncated-tail handling and keep info logs excluded from fault injection Reviewed By: hx235 Differential Revision: D101973626 fbshipit-source-id: fdcb5b6370cf92a046e09b8d3391e80eecb66c23
1 parent 5bf7818 commit 214869a

13 files changed

Lines changed: 1090 additions & 937 deletions

‎db_stress_tool/db_stress_gflags.cc‎

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -706,20 +706,6 @@ DEFINE_string(
706706
"only tracked when --sync_fault_injection is set. See --seed and "
707707
"--nooverwritepercent for further requirements.");
708708

709-
DEFINE_bool(expected_state_trace_debug, true,
710-
"If true, print debug logs while replaying expected-state trace "
711-
"records during crash recovery verification.");
712-
713-
DEFINE_int64(
714-
expected_state_trace_debug_key, -1,
715-
"If non-negative, restrict expected-state trace debug logs to the "
716-
"specified logical key where possible. Raw-key roundtrip mismatches for "
717-
"that logical key are still logged.");
718-
719-
DEFINE_int32(expected_state_trace_debug_max_logs, 200,
720-
"Maximum number of expected-state trace debug log lines to emit "
721-
"per restore attempt.");
722-
723709
DEFINE_bool(verify_checksum, false,
724710
"Verify checksum for every block read from storage");
725711

‎db_stress_tool/db_stress_test_base.cc‎

Lines changed: 30 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
// found in the LICENSE file. See the AUTHORS file for names of contributors.
99
//
1010

11+
#include <cstdlib>
1112
#include <ios>
1213
#include <memory>
1314
#include <mutex>
@@ -30,6 +31,7 @@
3031
#include "db_stress_tool/db_stress_wide_merge_operator.h"
3132
#include "file/file_util.h"
3233
#include "options/options_parser.h"
34+
#include "port/port.h"
3335
#include "rocksdb/convenience.h"
3436
#include "rocksdb/filter_policy.h"
3537
#include "rocksdb/io_dispatcher.h"
@@ -258,6 +260,32 @@ bool NeedsFaultInjection() {
258260
FLAGS_write_fault_one_in || FLAGS_sync_fault_injection;
259261
}
260262

263+
std::string GetFaultInjectionLogBaseDir() {
264+
if (!FLAGS_env_uri.empty() || !FLAGS_fs_uri.empty()) {
265+
return "/tmp";
266+
}
267+
// db_stress reads environment variables during single-threaded startup,
268+
// before worker threads are created.
269+
// NOLINTNEXTLINE(concurrency-mt-unsafe)
270+
const char* test_tmpdir = std::getenv("TEST_TMPDIR");
271+
return test_tmpdir != nullptr && test_tmpdir[0] != '\0' ? test_tmpdir
272+
: "/tmp";
273+
}
274+
275+
std::string GetFaultInjectionLogPath(const std::string& db_label) {
276+
const std::string log_dir =
277+
GetFaultInjectionLogBaseDir() + "/fault_injection_logs";
278+
Status s = Env::Default()->CreateDirIfMissing(log_dir);
279+
if (!s.ok()) {
280+
fprintf(stderr, "Failed to create directory %s: %s\n", log_dir.c_str(),
281+
s.ToString().c_str());
282+
exit(1);
283+
}
284+
return log_dir + "/fault_injection_" + std::to_string(port::GetProcessID()) +
285+
"_" + std::to_string(Env::Default()->NowMicros()) + "_" + db_label +
286+
".bin";
287+
}
288+
261289
} // namespace
262290

263291
const std::string& StressTest::GetDbLabel() const { return db_label_; }
@@ -285,7 +313,8 @@ StressTest::StressTest(int db_index, const std::string& db_path,
285313
std::make_shared<DbStressFSWrapper>(raw_env->GetFileSystem())),
286314
db_fault_injection_fs_(
287315
NeedsFaultInjection()
288-
? std::make_shared<FaultInjectionTestFS>(db_stress_fs_)
316+
? std::make_shared<FaultInjectionTestFS>(
317+
db_stress_fs_, GetFaultInjectionLogPath(db_label_))
289318
: nullptr),
290319
db_env_(std::make_unique<CompositeEnvWrapper>(
291320
raw_env, db_fault_injection_fs_
@@ -314,23 +343,6 @@ StressTest::StressTest(int db_index, const std::string& db_path,
314343
// StressTest::Open() for open fault injection and in RunStressTestImpl()
315344
// for proper fault injection setup.
316345
db_fault_injection_fs_->SetFilesystemDirectWritable(true);
317-
318-
// Set the fault injection log file path so that PrintAll() writes to a
319-
// file instead of stderr (signal-safe). PrintAll() opens this path with
320-
// plain POSIX open(), not through raw_env, so the log path must stay on
321-
// the local filesystem. Store it in the local test directory (TEST_TMPDIR
322-
// via Env::Default()) outside the DB directory so it survives DB reopen
323-
// and gets included in the sandcastle db.tar.gz artifact for post-failure
324-
// analysis.
325-
std::string log_dir;
326-
if (!Env::Default()->GetTestDirectory(&log_dir).ok() || log_dir.empty()) {
327-
log_dir = "/tmp";
328-
}
329-
std::string log_path = log_dir + "/fault_injection_" +
330-
std::to_string(getpid()) + "_" +
331-
std::to_string(Env::Default()->NowMicros()) + "_" +
332-
GetDbLabel() + ".log";
333-
db_fault_injection_fs_->SetInjectedErrorLogPath(log_path);
334346
}
335347

336348
if (FLAGS_destroy_db_initially) {

‎db_stress_tool/db_stress_tool.cc‎

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ static std::shared_ptr<ROCKSDB_NAMESPACE::Env> legacy_env_wrapper_guard;
3838
// Raw pointers for signal-safe crash callback. Signal handlers can only
3939
// access file-static/global variables; can't capture StressTest instances.
4040
static std::vector<ROCKSDB_NAMESPACE::FaultInjectionTestFS*>
41-
fault_fs_for_crash_report;
41+
fault_fs_for_crash_flush;
4242

4343
int ValidateNumDbsFlags() {
4444
if (FLAGS_num_dbs < 1) {
@@ -88,20 +88,20 @@ int DestroyAllDbs() {
8888

8989
void RegisterCrashCallbacks(
9090
const std::vector<std::unique_ptr<StressTest>>& stress_tests, int num_dbs) {
91-
fault_fs_for_crash_report.resize(num_dbs, nullptr);
91+
fault_fs_for_crash_flush.resize(num_dbs, nullptr);
9292
bool any_fault_fs = false;
9393
for (int i = 0; i < num_dbs; i++) {
94-
fault_fs_for_crash_report[i] =
94+
fault_fs_for_crash_flush[i] =
9595
stress_tests[i]->GetDbFaultInjectionFs().get();
96-
if (fault_fs_for_crash_report[i]) {
96+
if (fault_fs_for_crash_flush[i]) {
9797
any_fault_fs = true;
9898
}
9999
}
100100
if (any_fault_fs) {
101101
port::RegisterCrashCallback([]() {
102-
for (auto* fs : fault_fs_for_crash_report) {
102+
for (auto* fs : fault_fs_for_crash_flush) {
103103
if (fs) {
104-
fs->PrintRecentInjectedErrors();
104+
fs->FlushRecentInjectedErrors();
105105
}
106106
}
107107
});
@@ -633,7 +633,7 @@ int db_stress_tool(int argc, char** argv) {
633633
}
634634
stress_tests[i]->CleanUp();
635635
}
636-
for (auto& fs : fault_fs_for_crash_report) {
636+
for (auto& fs : fault_fs_for_crash_flush) {
637637
fs = nullptr;
638638
}
639639
return all_passed ? 0 : 1;

0 commit comments

Comments
 (0)