summaryrefslogtreecommitdiffstats
path: root/llkd
diff options
context:
space:
mode:
authorMark Salyzyn2018-03-19 17:16:29 -0500
committerMark Salyzyn2018-04-18 16:02:16 -0500
commitafd66f2fd3a3a14dfec989c4b2c98a5be4046947 (patch)
tree5ac3fce5c884c24328745a586a4e2634e06e8223 /llkd
parentd035dbbecf44191af398261b722d72026678eea8 (diff)
downloadplatform-system-core-afd66f2fd3a3a14dfec989c4b2c98a5be4046947.tar.gz
platform-system-core-afd66f2fd3a3a14dfec989c4b2c98a5be4046947.tar.xz
platform-system-core-afd66f2fd3a3a14dfec989c4b2c98a5be4046947.zip
llkd: bootstat: propagate detailed livelock canonical boot reason
Report kernel_panic,sysrq,livelock,<state> reboot reason via last dmesg (pstore console). Add ro.llk.killtest property, which will allow reliable ABA platforms to drop kill test and go directly to kernel panic. This should also allow some manual unit testing of the canonical boot reason report. New canonical boot reasons from llkd are: - kernel_panic,sysrq,livelock,alarm llkd itself locked up (Hail Mary) - kernel_panic,sysrq,livelock,driver uninterruptible D state - kernel_panic,sysrq,livelock,zombie uninterruptible Z state Manual test assumptions: - llkd is built by the platform and landed on system partition - unit test is built and landed in /data/nativetest (could land in /data/nativetest64, adjust test correspondingly) - llkd not enabled, ro.llk.enable and ro.llk.killtest are not set by platform allowing test to adjust all the configuration properties and start llkd. - or, llkd is enabled, ro.llk.enable is true, and killtest is disabled, ro.llk.killtest is false, set up by the platform. This breaks the go/apct generic operations of the unit test for llkd.zombie and llkd.driver as kernel panic results requiring manual intervention otherwise. If test moves to go/apct, then we will be forced to bypass these tests under this condition (but allow them to run if ro.llk.killtest is "off" so specific testing above/below can be run). for i in driver zombie; do adb shell su root setprop ro.llk.killtest off adb shell /data/nativetest/llkd_unit_test/llkd_unit_test --gtest_filter=llkd.${i} adb wait-for-device adb shell su root setprop ro.llk.killtest off sleep 60 adb shell getprop sys.boot.reason adb shell /data/nativetest/llkd_unit_test/llkd_unit_test --gtest_filter=llkd.${i} done Test: llkd_unit_test (see test assumptions) Bug: 33808187 Bug: 72838192 Change-Id: I2b24875376ddfdbc282ba3da5c5b3567de85dbc0
Diffstat (limited to 'llkd')
-rw-r--r--llkd/README.md11
-rw-r--r--llkd/include/llkd.h2
-rw-r--r--llkd/libllkd.cpp23
-rw-r--r--llkd/llkd.rc1
-rw-r--r--llkd/tests/llkd_test.cpp29
5 files changed, 49 insertions, 17 deletions
diff --git a/llkd/README.md b/llkd/README.md
index 71319c8af..b2ba2a2f6 100644
--- a/llkd/README.md
+++ b/llkd/README.md
@@ -53,7 +53,9 @@ on purpose, and llkd effectively sweeps up processes that create these
53conditions. If the test can, it will reconfigure llkd to expedite the test 53conditions. If the test can, it will reconfigure llkd to expedite the test
54duration by adjusting the ro.llk.* Android properties. Tests run the D state 54duration by adjusting the ro.llk.* Android properties. Tests run the D state
55with some scheduling progress to ensure that ABA checking prevents false 55with some scheduling progress to ensure that ABA checking prevents false
56triggers. 56triggers. If 100% reliable ABA on platform, then ro.llk.killtest can be
57set to false; however this will result in some of the unit tests to panic
58kernel instead of deal with more graceful kill operation.
57 59
58Android Properties 60Android Properties
59------------------ 61------------------
@@ -108,13 +110,6 @@ default <empty>, comma separated list of uid numbers or names.
108Architectural Concerns 110Architectural Concerns
109---------------------- 111----------------------
110 112
111- Figure out how to communicate the kernel panic better to bootstat canonical
112 boot reason determination. This may require an alteration to bootstat, or
113 some logging from llkd. Would like to see boot reason to be
114 watchdog,livelock as a minimum requirement. Or more specifically would want
115 watchdog,livelock,device or watchdog,livelock,zombie be reported.
116 Currently reports panic,sysrq (user requested panic) or panic depending on
117 system support of pstore.
118- Create kernel module and associated gTest to actually test panic. 113- Create kernel module and associated gTest to actually test panic.
119- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally 114- Create gTest to test out blacklist (ro.llk.blacklist.<properties> generally
120 not be inputs). Could require more test-only interfaces to libllkd. 115 not be inputs). Could require more test-only interfaces to libllkd.
diff --git a/llkd/include/llkd.h b/llkd/include/llkd.h
index bd0739bb0..e3ae4bbd8 100644
--- a/llkd/include/llkd.h
+++ b/llkd/include/llkd.h
@@ -37,6 +37,8 @@ unsigned llkCheckMilliseconds(void);
37#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY 37#define KHT_ENABLE_PROPERTY "ro." KHT_ENABLE_WRITEABLE_PROPERTY
38#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall" 38#define LLK_MLOCKALL_PROPERTY "ro.llk.mlockall"
39#define LLK_MLOCKALL_DEFAULT true 39#define LLK_MLOCKALL_DEFAULT true
40#define LLK_KILLTEST_PROPERTY "ro.llk.killtest"
41#define LLK_KILLTEST_DEFAULT true
40#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms" 42#define LLK_TIMEOUT_MS_PROPERTY "ro.llk.timeout_ms"
41#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout" 43#define KHT_TIMEOUT_PROPERTY "ro.khungtask.timeout"
42#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms" 44#define LLK_D_TIMEOUT_MS_PROPERTY "ro.llk.D.timeout_ms"
diff --git a/llkd/libllkd.cpp b/llkd/libllkd.cpp
index d82810572..f357cc2ca 100644
--- a/llkd/libllkd.cpp
+++ b/llkd/libllkd.cpp
@@ -70,6 +70,7 @@ milliseconds llkCycle; // ms to next thread check
70bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled 70bool llkEnable = LLK_ENABLE_DEFAULT; // llk daemon enabled
71bool llkRunning = false; // thread is running 71bool llkRunning = false; // thread is running
72bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked 72bool llkMlockall = LLK_MLOCKALL_DEFAULT; // run mlocked
73bool llkTestWithKill = LLK_KILLTEST_DEFAULT; // issue test kills
73milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout 74milliseconds llkTimeoutMs = LLK_TIMEOUT_MS_DEFAULT; // default timeout
74enum { llkStateD, llkStateZ, llkNumStates }; // state indexes 75enum { llkStateD, llkStateZ, llkNumStates }; // state indexes
75milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state 76milliseconds llkStateTimeoutMs[llkNumStates]; // timeout override for each detection state
@@ -292,7 +293,7 @@ struct proc {
292 exeMissingValid(false), 293 exeMissingValid(false),
293 cmdlineValid(false), 294 cmdlineValid(false),
294 updated(true), 295 updated(true),
295 killed(false) { 296 killed(!llkTestWithKill) {
296 memset(comm, '\0', sizeof(comm)); 297 memset(comm, '\0', sizeof(comm));
297 setComm(_comm); 298 setComm(_comm);
298 } 299 }
@@ -475,8 +476,8 @@ bool llkWriteStringToFileConfirm(const std::string& string, const std::string& f
475 return android::base::Trim(content) == string; 476 return android::base::Trim(content) == string;
476} 477}
477 478
478void llkPanicKernel(bool dump, pid_t tid) __noreturn; 479void llkPanicKernel(bool dump, pid_t tid, const char* state) __noreturn;
479void llkPanicKernel(bool dump, pid_t tid) { 480void llkPanicKernel(bool dump, pid_t tid, const char* state) {
480 auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger"); 481 auto sysrqTriggerFd = llkFileToWriteFd("/proc/sysrq-trigger");
481 if (sysrqTriggerFd < 0) { 482 if (sysrqTriggerFd < 0) {
482 // DYB 483 // DYB
@@ -496,6 +497,8 @@ void llkPanicKernel(bool dump, pid_t tid) {
496 } 497 }
497 ::usleep(200000); // let everything settle 498 ::usleep(200000); // let everything settle
498 } 499 }
500 llkWriteStringToFile(std::string("SysRq : Trigger a crash : 'livelock,") + state + "'\n",
501 "/dev/kmsg");
499 android::base::WriteStringToFd("c", sysrqTriggerFd); 502 android::base::WriteStringToFd("c", sysrqTriggerFd);
500 // NOTREACHED 503 // NOTREACHED
501 // DYB 504 // DYB
@@ -507,7 +510,7 @@ void llkPanicKernel(bool dump, pid_t tid) {
507} 510}
508 511
509void llkAlarmHandler(int) { 512void llkAlarmHandler(int) {
510 llkPanicKernel(false, ::getpid()); 513 llkPanicKernel(false, ::getpid(), "alarm");
511} 514}
512 515
513milliseconds GetUintProperty(const std::string& key, milliseconds def) { 516milliseconds GetUintProperty(const std::string& key, milliseconds def) {
@@ -686,7 +689,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
686 (val != procp->nrSwitches)) { 689 (val != procp->nrSwitches)) {
687 procp->nrSwitches = val; 690 procp->nrSwitches = val;
688 procp->count = 0ms; 691 procp->count = 0ms;
689 procp->killed = false; 692 procp->killed = !llkTestWithKill;
690 } 693 }
691 return; 694 return;
692 } 695 }
@@ -700,7 +703,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
700 if (schedUpdate != procp->schedUpdate) { 703 if (schedUpdate != procp->schedUpdate) {
701 procp->schedUpdate = schedUpdate; 704 procp->schedUpdate = schedUpdate;
702 procp->count = 0ms; 705 procp->count = 0ms;
703 procp->killed = false; 706 procp->killed = !llkTestWithKill;
704 } 707 }
705 } 708 }
706 709
@@ -709,7 +712,7 @@ void llkCheckSchedUpdate(proc* procp, const std::string& piddir) {
709 if (static_cast<uint64_t>(val) != procp->nrSwitches) { 712 if (static_cast<uint64_t>(val) != procp->nrSwitches) {
710 procp->nrSwitches = val; 713 procp->nrSwitches = val;
711 procp->count = 0ms; 714 procp->count = 0ms;
712 procp->killed = false; 715 procp->killed = !llkTestWithKill;
713 } 716 }
714 } 717 }
715} 718}
@@ -719,6 +722,7 @@ void llkLogConfig(void) {
719 << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n" 722 << LLK_ENABLE_PROPERTY "=" << llkFormat(llkEnable) << "\n"
720 << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n" 723 << KHT_ENABLE_PROPERTY "=" << llkFormat(khtEnable) << "\n"
721 << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n" 724 << LLK_MLOCKALL_PROPERTY "=" << llkFormat(llkMlockall) << "\n"
725 << LLK_KILLTEST_PROPERTY "=" << llkFormat(llkTestWithKill) << "\n"
722 << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n" 726 << KHT_TIMEOUT_PROPERTY "=" << llkFormat(khtTimeout) << "\n"
723 << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n" 727 << LLK_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkTimeoutMs) << "\n"
724 << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n" 728 << LLK_D_TIMEOUT_MS_PROPERTY "=" << llkFormat(llkStateTimeoutMs[llkStateD]) << "\n"
@@ -869,7 +873,7 @@ milliseconds llkCheck(bool checkRunning) {
869 procp->time = utime + stime; 873 procp->time = utime + stime;
870 if (procp->state != state) { 874 if (procp->state != state) {
871 procp->count = 0ms; 875 procp->count = 0ms;
872 procp->killed = false; 876 procp->killed = !llkTestWithKill;
873 procp->state = state; 877 procp->state = state;
874 } else { 878 } else {
875 procp->count += llkCycle; 879 procp->count += llkCycle;
@@ -973,7 +977,7 @@ milliseconds llkCheck(bool checkRunning) {
973 // We are here because we have confirmed kernel live-lock 977 // We are here because we have confirmed kernel live-lock
974 LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid 978 LOG(ERROR) << state << ' ' << llkFormat(procp->count) << ' ' << ppid << "->" << pid
975 << "->" << tid << ' ' << procp->getComm() << " [panic]"; 979 << "->" << tid << ' ' << procp->getComm() << " [panic]";
976 llkPanicKernel(true, tid); 980 llkPanicKernel(true, tid, (state == 'Z') ? "zombie" : "driver");
977 } 981 }
978 LOG(VERBOSE) << "+closedir()"; 982 LOG(VERBOSE) << "+closedir()";
979 } 983 }
@@ -1045,6 +1049,7 @@ bool llkInit(const char* threadname) {
1045 } 1049 }
1046 khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable); 1050 khtEnable = android::base::GetBoolProperty(KHT_ENABLE_PROPERTY, khtEnable);
1047 llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall); 1051 llkMlockall = android::base::GetBoolProperty(LLK_MLOCKALL_PROPERTY, llkMlockall);
1052 llkTestWithKill = android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, llkTestWithKill);
1048 // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set 1053 // if LLK_TIMOUT_MS_PROPERTY was not set, we will use a set
1049 // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value. 1054 // KHT_TIMEOUT_PROPERTY as co-operative guidance for the default value.
1050 khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout); 1055 khtTimeout = GetUintProperty(KHT_TIMEOUT_PROPERTY, khtTimeout);
diff --git a/llkd/llkd.rc b/llkd/llkd.rc
index f762a5ced..e538cdb91 100644
--- a/llkd/llkd.rc
+++ b/llkd/llkd.rc
@@ -44,5 +44,6 @@ service llkd /system/bin/llkd
44 user llkd 44 user llkd
45 group llkd readproc 45 group llkd readproc
46 capabilities KILL IPC_LOCK 46 capabilities KILL IPC_LOCK
47 file /dev/kmsg w
47 file /proc/sysrq-trigger w 48 file /proc/sysrq-trigger w
48 writepid /dev/cpuset/system-background/tasks 49 writepid /dev/cpuset/system-background/tasks
diff --git a/llkd/tests/llkd_test.cpp b/llkd/tests/llkd_test.cpp
index 2de18205c..3a15ff1e3 100644
--- a/llkd/tests/llkd_test.cpp
+++ b/llkd/tests/llkd_test.cpp
@@ -154,6 +154,27 @@ inline void waitForPid(pid_t child_pid) {
154 ASSERT_EQ(WTERMSIG(wstatus), SIGKILL); 154 ASSERT_EQ(WTERMSIG(wstatus), SIGKILL);
155} 155}
156 156
157bool checkKill(const char* reason) {
158 if (android::base::GetBoolProperty(LLK_KILLTEST_PROPERTY, LLK_KILLTEST_DEFAULT)) {
159 return false;
160 }
161 auto bootreason = android::base::GetProperty("sys.boot.reason", "nothing");
162 if (bootreason == reason) {
163 GTEST_LOG_INFO << "Expected test result confirmed " << reason << "\n";
164 return true;
165 }
166 GTEST_LOG_WARNING << "Expected test result is " << reason << "\n";
167
168 // apct adjustment if needed (set LLK_KILLTEST_PROPERTY to "off" to allow test)
169 //
170 // if (android::base::GetProperty(LLK_KILLTEST_PROPERTY, "") == "false") {
171 // GTEST_LOG_WARNING << "Bypassing test\n";
172 // return true;
173 // }
174
175 return false;
176}
177
157} // namespace 178} // namespace
158 179
159// The tests that use this helper are to simulate processes stuck in 'D' 180// The tests that use this helper are to simulate processes stuck in 'D'
@@ -221,6 +242,10 @@ TEST(llkd, driver_ABA_glacial) {
221// is that llkd will perform kill mitigation and not progress to kernel_panic. 242// is that llkd will perform kill mitigation and not progress to kernel_panic.
222 243
223TEST(llkd, zombie) { 244TEST(llkd, zombie) {
245 if (checkKill("kernel_panic,sysrq,livelock,zombie")) {
246 return;
247 }
248
224 const auto period = llkdSleepPeriod('Z'); 249 const auto period = llkdSleepPeriod('Z');
225 250
226 /* Create a Persistent Zombie Process */ 251 /* Create a Persistent Zombie Process */
@@ -241,6 +266,10 @@ TEST(llkd, zombie) {
241} 266}
242 267
243TEST(llkd, driver) { 268TEST(llkd, driver) {
269 if (checkKill("kernel_panic,sysrq,livelock,driver")) {
270 return;
271 }
272
244 const auto period = llkdSleepPeriod('D'); 273 const auto period = llkdSleepPeriod('D');
245 274
246 /* Create a Persistent Device Process */ 275 /* Create a Persistent Device Process */