From afd66f2fd3a3a14dfec989c4b2c98a5be4046947 Mon Sep 17 00:00:00 2001 From: Mark Salyzyn Date: Mon, 19 Mar 2018 15:16:29 -0700 Subject: llkd: bootstat: propagate detailed livelock canonical boot reason Report kernel_panic,sysrq,livelock, reboot reason via last dmesg (pstore console). Add ro.llk.killtest property, which will allow reliable ABA platforms to drop kill test and go directly to kernel panic. This should also allow some manual unit testing of the canonical boot reason report. New canonical boot reasons from llkd are: - kernel_panic,sysrq,livelock,alarm llkd itself locked up (Hail Mary) - kernel_panic,sysrq,livelock,driver uninterruptible D state - kernel_panic,sysrq,livelock,zombie uninterruptible Z state Manual test assumptions: - llkd is built by the platform and landed on system partition - unit test is built and landed in /data/nativetest (could land in /data/nativetest64, adjust test correspondingly) - llkd not enabled, ro.llk.enable and ro.llk.killtest are not set by platform allowing test to adjust all the configuration properties and start llkd. - or, llkd is enabled, ro.llk.enable is true, and killtest is disabled, ro.llk.killtest is false, setup by the platform. This breaks the go/apct generic operations of the unit test for llk.zombie and llk.driver as kernel panic results requiring manual intervention otherwise. If test moves to go/apct, then we will be forced to bypass these tests under this condition (but allow them to run if ro.llk.killtest is "off" so specific testing above/below can be run). 
for i in driver zombie; do adb shell su root setprop ro.llk.killtest off adb shell /data/nativetest/llkd_unit_test/llkd_unit_test --gtest_filter=llkd.${i} adb wait-for-device adb shell su root setprop ro.llk.killtest off sleep 60 adb shell getprop sys.boot.reason adb shell /data/nativetest/llkd_unit_test/llkd_unit_test --gtest_filter=llkd.${i} done Test: llkd_unit_test (see test assumptions) Bug: 33808187 Bug: 72838192 Change-Id: I2b24875376ddfdbc282ba3da5c5b3567de85dbc0 --- llkd/README.md | 11 +++-------- llkd/include/llkd.h | 2 ++ llkd/libllkd.cpp | 23 ++++++++++++++--------- llkd/llkd.rc | 1 + llkd/tests/llkd_test.cpp | 29 +++++++++++++++++++++++++++++ 5 files changed, 49 insertions(+), 17 deletions(-) (limited to 'llkd') diff --git a/llkd/README.md b/llkd/README.md index 71319c8af..b2ba2a2f6 100644 --- a/llkd/README.md +++ b/llkd/README.md @@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these conditions. If the test can, it will reconfigure llkd to expedite the test duration by adjusting the ro.llk.* Android properties. Tests run the D state with some scheduling progress to ensure that ABA checking prevents false -triggers. +triggers. If 100% reliable ABA on platform, then ro.llk.killtest can be +set to false; however this will result in some of the unit tests to panic +kernel instead of deal with more graceful kill operation. Android Properties ------------------ @@ -108,13 +110,6 @@ default , comma separated list of uid numbers or names. Architectural Concerns ---------------------- -- Figure out how to communicate the kernel panic better to bootstat canonical - boot reason determination. This may require an alteration to bootstat, or - some logging from llkd. Would like to see boot reason to be - watchdog,livelock as a minimum requirement. Or more specifically would want - watchdog,livelock,device or watchdog,livelock,zombie be reported. 
- Currently reports panic,sysrq (user requested panic) or panic depending on - system support of pstore. - Create kernel module and associated gTest to actually test panic. - Create gTest to test out blacklist (ro.llk.blacklist. generally not be inputs). Could require more test-only interfaces to libllkd. diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h index bd0739bb0..e3ae4bbd8 100644 --- a/llkd/include/llkd.h +++ b/llkd/include/llkd.h @@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void); #define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY #define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" #define LLK_MLOCKALL_DEFAULT true +#define LLK_KILLTEST_PROPERTY "ro.llk.killtest" +#define LLK_KILLTEST_DEFAULT true #define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" #define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" #define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp index d82810572..f357cc2ca 100644 --- a/llkd/libllkd.cpp +++ b/llkd/libllkd.cpp @@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled bool llkRunning = false; // thread is running bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked +bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout enum { llkStateD, llkStateZ, llkNumStates }; // state indexes milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state @@ -292,7 +293,7 @@ struct proc { exeMissingValid(false), cmdlineValid(false), updated(true), - killed(false) { + killed(!llkTestWithKill) { memset(comm, '\0', sizeof(comm)); setComm(_comm); } @@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f return android::base::Trim(content) == string; } -void llkPanicKernel(bool dump, pid_t tid) __noreturn; -void llkPanicKernel(bool dump, pid_t tid) { +void 
llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn; +void llkPanicKernel(bool dump, pid_t tid, const char* state) { auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger"); if (sysrqTriggerFd < 0) { // DYB @@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) { } ::usleep(200000); // let everything settle } + llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n", + "/dev/kmsg"); android::base::WriteStringToFd("c", sysrqTriggerFd); // NOTREACHED // DYB @@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) { } void llkAlarmHandler(int) { - llkPanicKernel(false, ::getpid()); + llkPanicKernel(false, ::getpid(), "alarm"); } milliseconds GetUintProperty(const std::string& key, milliseconds def) { @@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { (val != procp->nrSwitches)) { procp->nrSwitches = val; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } return; } @@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (schedUpdate != procp->schedUpdate) { procp->schedUpdate = schedUpdate; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } } @@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) { if (static_cast(val) != procp->nrSwitches) { procp->nrSwitches = val; procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; } } } @@ -719,6 +722,7 @@ void llkLogConfig(void) { << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" + << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n" << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) 
<< "\n" @@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) { procp->time = utime + stime; if (procp->state != state) { procp->count = 0ms; - procp->killed = false; + procp->killed = !llkTestWithKill; procp->state = state; } else { procp->count += llkCycle; @@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) { // We are here because we have confirmed kernel live-lock LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid << "->" << tid << ' ' << procp->getComm() << " [panic]"; - llkPanicKernel(true, tid); + llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver"); } LOG(VERBOSE) << "+closedir()"; } @@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) { } khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); + llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill); // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. 
khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); diff --git a/llkd/llkd.rc b/llkd/llkd.rc index f762a5ced..e538cdb91 100644 --- a/llkd/llkd.rc +++ b/llkd/llkd.rc @@ -44,5 +44,6 @@ service llkd /system/bin/llkd user llkd group llkd readproc capabilities KILL IPC_LOCK + file /dev/kmsg w file /proc/sysrq-trigger w writepid /dev/cpuset/system-background/tasks diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp index 2de18205c..3a15ff1e3 100644 --- a/llkd/tests/llkd_test.cpp +++ b/llkd/tests/llkd_test.cpp @@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) { ASSERT_EQ(WTERMSIG(wstatus), SIGKILL); } +bool checkKill(const char* reason) { + if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) { + return false; + } + auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing"); + if (bootreason == reason) { + GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n"; + return true; + } + GTEST_LOG_WARNING << "Expected test result is " << reason << "\n"; + + // apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test) + // + // if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") { + // GTEST_LOG_WARNING << "Bypassing test\n"; + // return true; + // } + + return false; +} + } // namespace // The tests that use this helper are to simulate processes stuck in 'D' @@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) { // is that llkd will perform kill mitigation and not progress to kernel_panic. TEST(llkd, zombie) { + if (checkKill("kernel_panic,sysrq,livelock,zombie")) { + return; + } + const auto period = llkdSleepPeriod('Z'); /* Create a Persistent Zombie Process */ @@ -241,6 +266,10 @@ TEST(llkd, zombie) { } TEST(llkd, driver) { + if (checkKill("kernel_panic,sysrq,livelock,driver")) { + return; + } + const auto period = llkdSleepPeriod('D'); /* Create a Persistent Device Process */ -- cgit v1.2.3-54-g00ecf