pacemaker  1.1.24-3850484742
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2019 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
23 #ifdef _POSIX_MEMLOCK
24 # include <sys/mman.h>
25 #endif
26 
/* Cached PID of the sbd daemon; written by pcmk_locate_sbd(), which resets
 * it to 0 when the lookup yields a negative value. Code in this file only
 * treats values greater than 1 as "sbd found". */
27 static int sbd_pid = 0;
28 
30 {
35 };
36 
/* procfs file holding the SysRq enable mask; read and rewritten by sysrq_init() */
37 #define SYSRQ "/proc/sys/kernel/sysrq"
38 
/*
 * Make sure the kernel SysRq mask permits the functions this file triggers.
 *
 * The current mask is read from SYSRQ. A value of exactly 1 is left alone;
 * otherwise the bits listed below are ORed in and the mask is written back.
 * If the current value cannot be parsed it is treated as 0.
 *
 * Only the first call does any work (guarded by a static flag), even if
 * that first attempt failed. When SUPPORT_PROCFS is not set this function
 * does nothing.
 */
void
sysrq_init(void)
{
#if SUPPORT_PROCFS
    static bool need_init = true;
    FILE *procf = NULL;
    int c = 0;

    if (!need_init) {
        return;
    }
    need_init = false;

    procf = fopen(SYSRQ, "r");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Cannot open "SYSRQ" for read");
        return;
    }
    if (fscanf(procf, "%d", &c) != 1) {
        crm_perror(LOG_ERR, "Parsing "SYSRQ" failed");
        c = 0;
    }
    fclose(procf);
    if (c == 1) {
        return;
    }

    /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
    c |= 136;
    procf = fopen(SYSRQ, "w");
    if (procf == NULL) {
        crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
        return;
    }
    if (fprintf(procf, "%d", c) < 0) {
        crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
        fclose(procf);
        return;
    }
    /* Buffered output is flushed here, so this is where a write error shows up */
    if (fclose(procf) != 0) {
        crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
    }
#endif // SUPPORT_PROCFS
    return;
}
78 
/*
 * Write a single SysRq command character to /proc/sysrq-trigger.
 *
 * sysrq_init() is called first so the required SysRq functions are enabled.
 * Failures to open or write the trigger file are logged and the function
 * returns normally, leaving any fallback to the caller. When SUPPORT_PROCFS
 * is not set this function does nothing.
 *
 * t: the SysRq command character to send (callers in this file pass
 *    'c' or 'b')
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    FILE *procf = NULL;

    sysrq_init();

    procf = fopen("/proc/sysrq-trigger", "a");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }
    crm_info("sysrq-trigger: %c", t);
    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
        fclose(procf);
        return;
    }
    /* The stream is buffered, so the command is actually delivered on close */
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
    }
#endif // SUPPORT_PROCFS
    return;
}
98 
99 
100 static void
101 pcmk_panic_local(void)
102 {
103  int rc = pcmk_ok;
104  uid_t uid = geteuid();
105  pid_t ppid = getppid();
106 
107  if(uid != 0 && ppid > 1) {
108  /* We're a non-root pacemaker daemon (cib, crmd, pengine,
109  * attrd, etc) with the original pacemakerd parent
110  *
111  * Of these, only crmd is likely to be initiating resets
112  */
113  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
115  return;
116 
117  } else if (uid != 0) {
118 #if SUPPORT_PROCFS
119  /*
120  * No permissions, and no pacemakerd parent to escalate to.
121  * Track down the new pacemakerd process and send a signal instead.
122  */
123  union sigval signal_value;
124 
125  memset(&signal_value, 0, sizeof(signal_value));
126  ppid = crm_procfs_pid_of("pacemakerd");
127  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
128 
129  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
130  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
131  }
132 #endif // SUPPORT_PROCFS
133 
134  /* The best we can do now is die */
136  return;
137  }
138 
139  /* We're either pacemakerd, or a pacemaker daemon running as root */
140 
141  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
142  sysrq_trigger('c');
143  } else {
144  sysrq_trigger('b');
145  }
146  /* reboot(RB_HALT_SYSTEM); rc = errno; */
147  reboot(RB_AUTOBOOT);
148  rc = errno;
149 
150  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
151 
152  if(ppid > 1) {
153  /* child daemon */
154  exit(pcmk_err_panic);
155  } else {
156  /* pacemakerd or orphan child */
157  exit(DAEMON_RESPAWN_STOP);
158  }
159 }
160 
161 static void
162 pcmk_panic_sbd(void)
163 {
164  union sigval signal_value;
165  pid_t ppid = getppid();
166 
167  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
168 
169  memset(&signal_value, 0, sizeof(signal_value));
170  /* TODO: Arrange for a slightly less brutal option? */
171  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
172  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
173  pcmk_panic_local();
174  }
175 
176  if(ppid > 1) {
177  /* child daemon */
178  exit(pcmk_err_panic);
179  } else {
180  /* pacemakerd or orphan child */
181  exit(DAEMON_RESPAWN_STOP);
182  }
183 }
184 
185 void
186 pcmk_panic(const char *origin)
187 {
188  static struct qb_log_callsite *panic_cs = NULL;
189 
190  if (panic_cs == NULL) {
191  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
192  }
193 
194  /* Ensure sbd_pid is set */
195  (void)pcmk_locate_sbd();
196 
197  if (panic_cs && panic_cs->targets) {
198  /* getppid() == 1 means our original parent no longer exists */
199  do_crm_log_always(LOG_EMERG,
200  "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
201  origin, sbd_pid, getppid());
203  return;
204  }
205 
206  if(sbd_pid > 1) {
207  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
208  pcmk_panic_sbd();
209 
210  } else {
211  do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
212  pcmk_panic_local();
213  }
214 }
215 
216 pid_t
218 {
219  char *pidfile = NULL;
220  char *sbd_path = NULL;
221 
222  if(sbd_pid > 1) {
223  return sbd_pid;
224  }
225 
226  /* Look for the pid file */
227  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
228  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
229 
230  /* Read the pid file */
231  CRM_ASSERT(pidfile);
232 
233  sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
234  if(sbd_pid > 0) {
235  crm_trace("SBD detected at pid=%d (file)", sbd_pid);
236 
237 #if SUPPORT_PROCFS
238  } else {
239  /* Fall back to /proc for systems that support it */
240  sbd_pid = crm_procfs_pid_of("sbd");
241  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
242 #endif // SUPPORT_PROCFS
243  }
244 
245  if(sbd_pid < 0) {
246  sbd_pid = 0;
247  crm_trace("SBD not detected");
248  }
249 
250  free(pidfile);
251  free(sbd_path);
252 
253  return sbd_pid;
254 }
255 
/*
 * Return the sbd watchdog timeout taken from the environment.
 *
 * The value of the SBD_WATCHDOG_TIMEOUT environment variable is passed to
 * crm_get_msec() and the result returned. getenv() yields NULL when the
 * variable is unset, and that NULL is handed to crm_get_msec() unchanged.
 * NOTE(review): what crm_get_msec() returns for NULL or unparsable input is
 * not visible here -- confirm before relying on a specific value.
 */
long
crm_get_sbd_timeout(void)
{
    const char *env_value = getenv("SBD_WATCHDOG_TIMEOUT");
    long sbd_timeout = crm_get_msec(env_value);

    return sbd_timeout;
}
264 
265 gboolean
266 check_sbd_timeout(const char *value)
267 {
268  long st_timeout = value? crm_get_msec(value) : 0;
269 
270  if (st_timeout <= 0) {
271  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
272  value? value : "default");
273 
274  } else if (pcmk_locate_sbd() == 0) {
275  do_crm_log_always(LOG_EMERG,
276  "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
277  value);
279  return FALSE;
280 
281  } else {
282  long sbd_timeout = crm_get_sbd_timeout();
283 
284  if (st_timeout < sbd_timeout) {
285  do_crm_log_always(LOG_EMERG,
286  "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
287  value, sbd_timeout);
289  return FALSE;
290  }
291  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
292  value, sbd_timeout);
293  }
294  return TRUE;
295 }
#define LOG_TRACE
Definition: logging.h:29
long crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: utils.c:826
const char * pcmk_strerror(int rc)
Definition: logging.c:1017
void sysrq_init(void)
Definition: watchdog.c:40
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:217
#define pcmk_ok
Definition: error.h:45
long long crm_get_msec(const char *input)
Definition: utils.c:589
unsigned int crm_trace_nonlog
Definition: logging.c:50
pcmk_panic_flags
Definition: watchdog.c:29
long crm_get_sbd_timeout(void)
Definition: watchdog.c:257
#define crm_debug(fmt, args...)
Definition: logging.h:279
#define crm_trace(fmt, args...)
Definition: logging.h:280
#define PCMK_RUN_DIR
Definition: config.h:678
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:239
#define pcmk_err_panic
Definition: error.h:71
#define DAEMON_RESPAWN_STOP
Definition: crm.h:55
#define SBIN_DIR
Definition: config.h:697
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:252
gboolean check_sbd_timeout(const char *value)
Definition: watchdog.c:266
#define CRM_ASSERT(expr)
Definition: error.h:20
int crm_exit(int rc)
Definition: utils.c:74
#define SYSRQ
Definition: watchdog.c:37
#define safe_str_eq(a, b)
Definition: util.h:74
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:118
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
void pcmk_panic(const char *origin)
Definition: watchdog.c:186
#define crm_info(fmt, args...)
Definition: logging.h:277