001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.util;
020
021import java.io.File;
022import java.io.IOException;
023import java.util.ArrayList;
024import java.util.Arrays;
025import java.util.Timer;
026import java.util.TimerTask;
027
028import org.apache.commons.logging.Log;
029import org.apache.commons.logging.LogFactory;
030import org.apache.hadoop.conf.Configuration;
031import org.apache.hadoop.fs.FileUtil;
032import org.apache.hadoop.service.AbstractService;
033import org.apache.hadoop.util.Shell.ExitCodeException;
034import org.apache.hadoop.util.Shell.ShellCommandExecutor;
035import org.apache.hadoop.util.Shell;
036import org.apache.hadoop.util.StringUtils;
037
038/**
039 * 
040 * The class which provides functionality of checking the health of the node
041 * using the configured node health script and reporting back to the service
042 * for which the health checker has been asked to report.
043 */
044public class NodeHealthScriptRunner extends AbstractService {
045
046  private static Log LOG = LogFactory.getLog(NodeHealthScriptRunner.class);
047
048  /** Absolute path to the health script. */
049  private String nodeHealthScript;
050  /** Delay after which node health script to be executed */
051  private long intervalTime;
052  /** Time after which the script should be timedout */
053  private long scriptTimeout;
054  /** Timer used to schedule node health monitoring script execution */
055  private Timer nodeHealthScriptScheduler;
056
057  /** ShellCommandExecutor used to execute monitoring script */
058  ShellCommandExecutor shexec = null;
059
060  /** Pattern used for searching in the output of the node health script */
061  static private final String ERROR_PATTERN = "ERROR";
062
063  /** Time out error message */
064  public static final String NODE_HEALTH_SCRIPT_TIMED_OUT_MSG = "Node health script timed out";
065
066  private boolean isHealthy;
067
068  private String healthReport;
069
070  private long lastReportedTime;
071
072  private TimerTask timer;
073  
074  private enum HealthCheckerExitStatus {
075    SUCCESS,
076    TIMED_OUT,
077    FAILED_WITH_EXIT_CODE,
078    FAILED_WITH_EXCEPTION,
079    FAILED
080  }
081
082
083  /**
084   * Class which is used by the {@link Timer} class to periodically execute the
085   * node health script.
086   * 
087   */
088  private class NodeHealthMonitorExecutor extends TimerTask {
089
090    String exceptionStackTrace = "";
091
092    public NodeHealthMonitorExecutor(String[] args) {
093      ArrayList<String> execScript = new ArrayList<String>();
094      execScript.add(nodeHealthScript);
095      if (args != null) {
096        execScript.addAll(Arrays.asList(args));
097      }
098      shexec = new ShellCommandExecutor(execScript
099          .toArray(new String[execScript.size()]), null, null, scriptTimeout);
100    }
101
102    @Override
103    public void run() {
104      HealthCheckerExitStatus status = HealthCheckerExitStatus.SUCCESS;
105      try {
106        shexec.execute();
107      } catch (ExitCodeException e) {
108        // ignore the exit code of the script
109        status = HealthCheckerExitStatus.FAILED_WITH_EXIT_CODE;
110        // On Windows, we will not hit the Stream closed IOException
111        // thrown by stdout buffered reader for timeout event.
112        if (Shell.WINDOWS && shexec.isTimedOut()) {
113          status = HealthCheckerExitStatus.TIMED_OUT;
114        }
115      } catch (Exception e) {
116        LOG.warn("Caught exception : " + e.getMessage());
117        if (!shexec.isTimedOut()) {
118          status = HealthCheckerExitStatus.FAILED_WITH_EXCEPTION;
119        } else {
120          status = HealthCheckerExitStatus.TIMED_OUT;
121        }
122        exceptionStackTrace = StringUtils.stringifyException(e);
123      } finally {
124        if (status == HealthCheckerExitStatus.SUCCESS) {
125          if (hasErrors(shexec.getOutput())) {
126            status = HealthCheckerExitStatus.FAILED;
127          }
128        }
129        reportHealthStatus(status);
130      }
131    }
132
133    /**
134     * Method which is used to parse output from the node health monitor and
135     * send to the report address.
136     * 
137     * The timed out script or script which causes IOException output is
138     * ignored.
139     * 
140     * The node is marked unhealthy if
141     * <ol>
142     * <li>The node health script times out</li>
143     * <li>The node health scripts output has a line which begins with ERROR</li>
144     * <li>An exception is thrown while executing the script</li>
145     * </ol>
146     * If the script throws {@link IOException} or {@link ExitCodeException} the
147     * output is ignored and node is left remaining healthy, as script might
148     * have syntax error.
149     * 
150     * @param status
151     */
152    void reportHealthStatus(HealthCheckerExitStatus status) {
153      long now = System.currentTimeMillis();
154      switch (status) {
155      case SUCCESS:
156        setHealthStatus(true, "", now);
157        break;
158      case TIMED_OUT:
159        setHealthStatus(false, NODE_HEALTH_SCRIPT_TIMED_OUT_MSG);
160        break;
161      case FAILED_WITH_EXCEPTION:
162        setHealthStatus(false, exceptionStackTrace);
163        break;
164      case FAILED_WITH_EXIT_CODE:
165        setHealthStatus(true, "", now);
166        break;
167      case FAILED:
168        setHealthStatus(false, shexec.getOutput());
169        break;
170      }
171    }
172
173    /**
174     * Method to check if the output string has line which begins with ERROR.
175     * 
176     * @param output
177     *          string
178     * @return true if output string has error pattern in it.
179     */
180    private boolean hasErrors(String output) {
181      String[] splits = output.split("\n");
182      for (String split : splits) {
183        if (split.startsWith(ERROR_PATTERN)) {
184          return true;
185        }
186      }
187      return false;
188    }
189  }
190
191  public NodeHealthScriptRunner(String scriptName, long chkInterval, long timeout,
192      String[] scriptArgs) {
193    super(NodeHealthScriptRunner.class.getName());
194    this.lastReportedTime = System.currentTimeMillis();
195    this.isHealthy = true;
196    this.healthReport = "";
197    this.nodeHealthScript = scriptName;
198    this.intervalTime = chkInterval;
199    this.scriptTimeout = timeout;
200    this.timer = new NodeHealthMonitorExecutor(scriptArgs);
201  }
202
203  /*
204   * Method which initializes the values for the script path and interval time.
205   */
206  @Override
207  protected void serviceInit(Configuration conf) throws Exception {
208    super.serviceInit(conf);
209  }
210
211  /**
212   * Method used to start the Node health monitoring.
213   * 
214   */
215  @Override
216  protected void serviceStart() throws Exception {
217    nodeHealthScriptScheduler = new Timer("NodeHealthMonitor-Timer", true);
218    // Start the timer task immediately and
219    // then periodically at interval time.
220    nodeHealthScriptScheduler.scheduleAtFixedRate(timer, 0, intervalTime);
221    super.serviceStart();
222  }
223
224  /**
225   * Method used to terminate the node health monitoring service.
226   * 
227   */
228  @Override
229  protected void serviceStop() {
230    if (nodeHealthScriptScheduler != null) {
231      nodeHealthScriptScheduler.cancel();
232    }
233    if (shexec != null) {
234      Process p = shexec.getProcess();
235      if (p != null) {
236        p.destroy();
237      }
238    }
239  }
240
241  /**
242   * Gets the if the node is healthy or not
243   * 
244   * @return true if node is healthy
245   */
246  public boolean isHealthy() {
247    return isHealthy;
248  }
249
250  /**
251   * Sets if the node is healhty or not considering disks' health also.
252   * 
253   * @param isHealthy
254   *          if or not node is healthy
255   */
256  private synchronized void setHealthy(boolean isHealthy) {
257    this.isHealthy = isHealthy;
258  }
259
260  /**
261   * Returns output from health script. if node is healthy then an empty string
262   * is returned.
263   * 
264   * @return output from health script
265   */
266  public String getHealthReport() {
267    return healthReport;
268  }
269
270  /**
271   * Sets the health report from the node health script. Also set the disks'
272   * health info obtained from DiskHealthCheckerService.
273   *
274   * @param healthReport
275   */
276  private synchronized void setHealthReport(String healthReport) {
277    this.healthReport = healthReport;
278  }
279  
280  /**
281   * Returns time stamp when node health script was last run.
282   * 
283   * @return timestamp when node health script was last run
284   */
285  public long getLastReportedTime() {
286    return lastReportedTime;
287  }
288
289  /**
290   * Sets the last run time of the node health script.
291   * 
292   * @param lastReportedTime
293   */
294  private synchronized void setLastReportedTime(long lastReportedTime) {
295    this.lastReportedTime = lastReportedTime;
296  }
297
298  /**
299   * Method used to determine if or not node health monitoring service should be
300   * started or not. Returns true if following conditions are met:
301   * 
302   * <ol>
303   * <li>Path to Node health check script is not empty</li>
304   * <li>Node health check script file exists</li>
305   * </ol>
306   * 
307   * @return true if node health monitoring service can be started.
308   */
309  public static boolean shouldRun(String healthScript) {
310    if (healthScript == null || healthScript.trim().isEmpty()) {
311      return false;
312    }
313    File f = new File(healthScript);
314    return f.exists() && FileUtil.canExecute(f);
315  }
316
317  private synchronized void setHealthStatus(boolean isHealthy, String output) {
318                LOG.info("health status being set as " + output);
319    this.setHealthy(isHealthy);
320    this.setHealthReport(output);
321  }
322  
323  private synchronized void setHealthStatus(boolean isHealthy, String output,
324      long time) {
325        LOG.info("health status being set as " + output);
326    this.setHealthStatus(isHealthy, output);
327    this.setLastReportedTime(time);
328  }
329
330  /**
331   * Used only by tests to access the timer task directly
332   * @return the timer task
333   */
334  public TimerTask getTimerTask() {
335    return timer;
336  }
337}