/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.util;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileUtil;
import org.apache.hadoop.service.AbstractService;
import org.apache.hadoop.util.Shell.ExitCodeException;
import org.apache.hadoop.util.Shell.ShellCommandExecutor;
import org.apache.hadoop.util.Shell;
import org.apache.hadoop.util.StringUtils;

/**
 *
 * The class which provides functionality of checking the health of the node
 * using the configured node health script and reporting back to the service
 * for which the health checker has been asked to report.
 *
 * <p>The script is executed from a {@link Timer} thread, while the health
 * state is read through the public getters from other threads. The state
 * fields are therefore {@code volatile} and all writes go through
 * {@code synchronized} setters.
 */
public class NodeHealthScriptRunner extends AbstractService {

  private static final Log LOG =
      LogFactory.getLog(NodeHealthScriptRunner.class);

  /** Absolute path to the health script. */
  private String nodeHealthScript;
  /** Delay after which node health script to be executed */
  private long intervalTime;
  /** Time after which the script should be timed out */
  private long scriptTimeout;
  /** Timer used to schedule node health monitoring script execution */
  private Timer nodeHealthScriptScheduler;

  /** ShellCommandExecutor used to execute monitoring script */
  ShellCommandExecutor shexec = null;

  /** Pattern used for searching in the output of the node health script */
  static private final String ERROR_PATTERN = "ERROR";

  /** Time out error message */
  public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG =
      "Node health script timed out";

  // The three state fields below are written by the timer thread and read by
  // arbitrary callers through unsynchronized getters; volatile guarantees
  // that readers see the latest value (and that the long is never torn).

  /** Result of the most recent health evaluation. */
  private volatile boolean isHealthy;

  /** Report text of the most recent health evaluation. */
  private volatile String healthReport;

  /** Time stamp recorded by the most recent update that carried one. */
  private volatile long lastReportedTime;

  /** The task that runs the health script; scheduled in serviceStart(). */
  private TimerTask timer;

  /** Outcome of a single execution of the health script. */
  private enum HealthCheckerExitStatus {
    SUCCESS,
    TIMED_OUT,
    FAILED_WITH_EXIT_CODE,
    FAILED_WITH_EXCEPTION,
    FAILED
  }


  /**
   * Class which is used by the {@link Timer} class to periodically execute the
   * node health script.
   *
   */
  private class NodeHealthMonitorExecutor extends TimerTask {

    /** Stack trace of the last exception caught while running the script. */
    String exceptionStackTrace = "";

    /**
     * Builds the command line (script path followed by the optional
     * arguments) and creates the shared {@link ShellCommandExecutor}.
     *
     * @param args arguments passed to the script; may be null
     */
    public NodeHealthMonitorExecutor(String[] args) {
      ArrayList<String> execScript = new ArrayList<String>();
      execScript.add(nodeHealthScript);
      if (args != null) {
        execScript.addAll(Arrays.asList(args));
      }
      shexec = new ShellCommandExecutor(execScript
          .toArray(new String[execScript.size()]), null, null, scriptTimeout);
    }

    /**
     * Executes the script once, classifies the outcome and reports it.
     */
    @Override
    public void run() {
      HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
      try {
        shexec.execute();
      } catch (ExitCodeException e) {
        // ignore the exit code of the script
        status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
        // On Windows, we will not hit the Stream closed IOException
        // thrown by stdout buffered reader for timeout event.
        if (Shell.WINDOWS && shexec.isTimedOut()) {
          status = HealthCheckerExitStatus.TIMED_OUT;
        }
      } catch (Exception e) {
        LOG.warn("Caught exception : " + e.getMessage());
        if (!shexec.isTimedOut()) {
          status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
        } else {
          status = HealthCheckerExitStatus.TIMED_OUT;
        }
        exceptionStackTrace = StringUtils.stringifyException(e);
      } finally {
        // A script that ran to completion is still a failure if its output
        // contains a line starting with the error pattern.
        if (status == HealthCheckerExitStatus.SUCCESS) {
          if (hasErrors(shexec.getOutput())) {
            status = HealthCheckerExitStatus.FAILED;
          }
        }
        reportHealthStatus(status);
      }
    }

    /**
     * Method which translates the outcome of a script run into the health
     * state of the node.
     *
     * The node is marked unhealthy if
     * <ol>
     * <li>The node health script times out</li>
     * <li>The node health scripts output has a line which begins with ERROR</li>
     * <li>An exception other than {@link ExitCodeException} (for example an
     * {@link IOException}) is thrown while executing the script; the
     * stack trace becomes the health report</li>
     * </ol>
     * If the script fails with {@link ExitCodeException} the output is
     * ignored and node is left remaining healthy, as script might have
     * syntax error.
     *
     * The last reported time is refreshed only for the outcomes that leave
     * the node healthy.
     *
     * @param status outcome of the script execution
     */
    void reportHealthStatus(HealthCheckerExitStatus status) {
      long now = System.currentTimeMillis();
      switch (status) {
      case SUCCESS:
        setHealthStatus(true, "", now);
        break;
      case TIMED_OUT:
        setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG);
        break;
      case FAILED_WITH_EXCEPTION:
        setHealthStatus(false, exceptionStackTrace);
        break;
      case FAILED_WITH_EXIT_CODE:
        setHealthStatus(true, "", now);
        break;
      case FAILED:
        setHealthStatus(false, shexec.getOutput());
        break;
      }
    }

    /**
     * Method to check if the output string has line which begins with ERROR.
     *
     * @param output
     *          string
     * @return true if output string has error pattern in it.
     */
    private boolean hasErrors(String output) {
      String[] splits = output.split("\n");
      for (String split : splits) {
        if (split.startsWith(ERROR_PATTERN)) {
          return true;
        }
      }
      return false;
    }
  }

  /**
   * Creates the runner. The node starts out healthy with an empty report,
   * and the last reported time is initialised to the construction time.
   *
   * @param scriptName path to the health script
   * @param chkInterval period, in milliseconds, between two script runs
   * @param timeout time after which a script run is considered timed out
   * @param scriptArgs arguments passed to the script; may be null
   */
  public NodeHealthScriptRunner(String scriptName, long chkInterval,
      long timeout, String[] scriptArgs) {
    super(NodeHealthScriptRunner.class.getName());
    this.lastReportedTime = System.currentTimeMillis();
    this.isHealthy = true;
    this.healthReport = "";
    this.nodeHealthScript = scriptName;
    this.intervalTime = chkInterval;
    this.scriptTimeout = timeout;
    this.timer = new NodeHealthMonitorExecutor(scriptArgs);
  }

  /*
   * Only delegates to the parent class; the script path, interval and
   * timeout are supplied through the constructor.
   */
  @Override
  protected void serviceInit(Configuration conf) throws Exception {
    super.serviceInit(conf);
  }

  /**
   * Method used to start the Node health monitoring.
   *
   */
  @Override
  protected void serviceStart() throws Exception {
    nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true);
    // Start the timer task immediately and
    // then periodically at interval time.
    nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime);
    super.serviceStart();
  }

  /**
   * Method used to terminate the node health monitoring service. Cancels the
   * scheduler and destroys the script process if one is available.
   *
   */
  @Override
  protected void serviceStop() {
    // NOTE(review): unlike serviceInit/serviceStart this does not delegate
    // to the parent class - confirm that is intentional.
    if (nodeHealthScriptScheduler != null) {
      nodeHealthScriptScheduler.cancel();
    }
    if (shexec != null) {
      Process p = shexec.getProcess();
      if (p != null) {
        p.destroy();
      }
    }
  }

  /**
   * Gets whether the node is healthy or not.
   *
   * @return true if node is healthy
   */
  public boolean isHealthy() {
    return isHealthy;
  }

  /**
   * Sets whether the node is healthy or not.
   *
   * @param isHealthy
   *          if or not node is healthy
   */
  private synchronized void setHealthy(boolean isHealthy) {
    this.isHealthy = isHealthy;
  }

  /**
   * Returns output from health script. if node is healthy then an empty string
   * is returned.
   *
   * @return output from health script
   */
  public String getHealthReport() {
    return healthReport;
  }

  /**
   * Sets the health report from the node health script.
   *
   * @param healthReport report text to store
   */
  private synchronized void setHealthReport(String healthReport) {
    this.healthReport = healthReport;
  }

  /**
   * Returns the time stamp of the last health update that recorded one.
   * Only script runs that leave the node healthy refresh this value; until
   * the first such run it holds the construction time of this object.
   *
   * @return the last reported time, in milliseconds
   */
  public long getLastReportedTime() {
    return lastReportedTime;
  }

  /**
   * Sets the last reported time of the node health script.
   *
   * @param lastReportedTime time stamp, in milliseconds
   */
  private synchronized void setLastReportedTime(long lastReportedTime) {
    this.lastReportedTime = lastReportedTime;
  }

  /**
   * Method used to determine if or not node health monitoring service should be
   * started or not. Returns true if following conditions are met:
   *
   * <ol>
   * <li>Path to Node health check script is not empty</li>
   * <li>Node health check script file exists</li>
   * <li>Node health check script file is executable</li>
   * </ol>
   *
   * @param healthScript path to the health script; may be null
   * @return true if node health monitoring service can be started.
   */
  public static boolean shouldRun(String healthScript) {
    if (healthScript == null || healthScript.trim().isEmpty()) {
      return false;
    }
    File f = new File(healthScript);
    return f.exists() && FileUtil.canExecute(f);
  }

  /**
   * Updates the health flag and the report, leaving the last reported time
   * untouched.
   *
   * @param isHealthy whether the node is healthy
   * @param output report text to store
   */
  private synchronized void setHealthStatus(boolean isHealthy, String output) {
    LOG.info("health status being set as " + output);
    this.setHealthy(isHealthy);
    this.setHealthReport(output);
  }

  /**
   * Updates the health flag, the report and the last reported time.
   *
   * @param isHealthy whether the node is healthy
   * @param output report text to store
   * @param time time stamp to record, in milliseconds
   */
  private synchronized void setHealthStatus(boolean isHealthy, String output,
      long time) {
    // The two-argument overload already logs the new status; logging here as
    // well would emit the same line twice for every update.
    this.setHealthStatus(isHealthy, output);
    this.setLastReportedTime(time);
  }

  /**
   * Used only by tests to access the timer task directly
   * @return the timer task
   */
  public TimerTask getTimerTask() {
    return timer;
  }
}