001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package org.apache.hadoop.ha;
019
020import java.io.IOException;
021import java.io.PrintStream;
022import java.util.ArrayList;
023import java.util.Arrays;
024import java.util.Collection;
025import java.util.Map;
026
027import org.apache.commons.cli.Options;
028import org.apache.commons.cli.CommandLine;
029import org.apache.commons.cli.GnuParser;
030import org.apache.commons.cli.ParseException;
031import org.apache.commons.logging.Log;
032import org.apache.commons.logging.LogFactory;
033
034import org.apache.hadoop.classification.InterfaceAudience;
035import org.apache.hadoop.conf.Configuration;
036import org.apache.hadoop.conf.Configured;
037import org.apache.hadoop.fs.CommonConfigurationKeys;
038import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
039import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
040import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
041import org.apache.hadoop.util.Tool;
042import org.apache.hadoop.util.ToolRunner;
043
044import com.google.common.base.Preconditions;
045import com.google.common.collect.ImmutableMap;
046
047/**
048 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
050 * mode, or to trigger a health-check.
051 */
052@InterfaceAudience.Private
053
054public abstract class HAAdmin extends Configured implements Tool {
055  
056  private static final String FORCEFENCE  = "forcefence";
057  private static final String FORCEACTIVE = "forceactive";
058  
059  /**
060   * Undocumented flag which allows an administrator to use manual failover
061   * state transitions even when auto-failover is enabled. This is an unsafe
062   * operation, which is why it is not documented in the usage below.
063   */
064  private static final String FORCEMANUAL = "forcemanual";
065  private static final Log LOG = LogFactory.getLog(HAAdmin.class);
066
067  private int rpcTimeoutForChecks = -1;
068  
069  protected final static Map<String, UsageInfo> USAGE =
070    ImmutableMap.<String, UsageInfo>builder()
071    .put("-transitionToActive",
072        new UsageInfo("[--"+FORCEACTIVE+"] <serviceId>", "Transitions the service into Active state"))
073    .put("-transitionToStandby",
074        new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
075    .put("-failover",
076        new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
077            "Failover from the first service to the second.\n" +
078            "Unconditionally fence services if the --"+FORCEFENCE+" option is used.\n" +
079            "Try to failover to the target service even if it is not ready if the " + 
080            "--" + FORCEACTIVE + " option is used."))
081    .put("-getServiceState",
082        new UsageInfo("<serviceId>", "Returns the state of the service"))
083    .put("-checkHealth",
084        new UsageInfo("<serviceId>",
085            "Requests that the service perform a health check.\n" + 
086            "The HAAdmin tool will exit with a non-zero exit code\n" +
087            "if the check fails."))
088    .put("-help",
089        new UsageInfo("<command>", "Displays help on the specified command"))
090    .build();
091
092  /** Output stream for errors, for use in tests */
093  protected PrintStream errOut = System.err;
094  protected PrintStream out = System.out;
095  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;
096
097  protected HAAdmin() {
098    super();
099  }
100
101  protected HAAdmin(Configuration conf) {
102    super(conf);
103  }
104
105  protected abstract HAServiceTarget resolveTarget(String string);
106  
107  protected Collection<String> getTargetIds(String targetNodeToActivate) {
108    return new ArrayList<String>(
109        Arrays.asList(new String[]{targetNodeToActivate}));
110  }
111
112  protected String getUsageString() {
113    return "Usage: HAAdmin";
114  }
115
116  protected void printUsage(PrintStream errOut) {
117    errOut.println(getUsageString());
118    for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
119      String cmd = e.getKey();
120      UsageInfo usage = e.getValue();
121      
122      errOut.println("    [" + cmd + " " + usage.args + "]"); 
123    }
124    errOut.println();
125    ToolRunner.printGenericCommandUsage(errOut);    
126  }
127  
128  private void printUsage(PrintStream errOut, String cmd) {
129    UsageInfo usage = USAGE.get(cmd);
130    if (usage == null) {
131      throw new RuntimeException("No usage for cmd " + cmd);
132    }
133    errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
134  }
135
136  private int transitionToActive(final CommandLine cmd)
137      throws IOException, ServiceFailedException {
138    String[] argv = cmd.getArgs();
139    if (argv.length != 1) {
140      errOut.println("transitionToActive: incorrect number of arguments");
141      printUsage(errOut, "-transitionToActive");
142      return -1;
143    }
144    /*  returns true if other target node is active or some exception occurred 
145        and forceActive was not set  */
146    if(!cmd.hasOption(FORCEACTIVE)) {
147      if(isOtherTargetNodeActive(argv[0], cmd.hasOption(FORCEACTIVE))) {
148        return -1;
149      }
150    }
151    HAServiceTarget target = resolveTarget(argv[0]);
152    if (!checkManualStateManagementOK(target)) {
153      return -1;
154    }
155    HAServiceProtocol proto = target.getProxy(
156        getConf(), 0);
157    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
158    return 0;
159  }
160  
161  /**
162   * Checks whether other target node is active or not
163   * @param targetNodeToActivate
164   * @return true if other target node is active or some other exception 
165   * occurred and forceActive was set otherwise false
166   * @throws IOException
167   */
168  private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
169      throws IOException  {
170    Collection<String> targetIds = getTargetIds(targetNodeToActivate);
171    targetIds.remove(targetNodeToActivate);
172    for(String targetId : targetIds) {
173      HAServiceTarget target = resolveTarget(targetId);
174      if (!checkManualStateManagementOK(target)) {
175        return true;
176      }
177      try {
178        HAServiceProtocol proto = target.getProxy(getConf(), 5000);
179        if(proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
180          errOut.println("transitionToActive: Node " +  targetId +" is already active");
181          printUsage(errOut, "-transitionToActive");
182          return true;
183        }
184      } catch (Exception e) {
185        //If forceActive switch is false then return true
186        if(!forceActive) {
187          errOut.println("Unexpected error occurred  " + e.getMessage());
188          printUsage(errOut, "-transitionToActive");
189          return true; 
190        }
191      }
192    }
193    return false;
194  }
195  
196  private int transitionToStandby(final CommandLine cmd)
197      throws IOException, ServiceFailedException {
198    String[] argv = cmd.getArgs();
199    if (argv.length != 1) {
200      errOut.println("transitionToStandby: incorrect number of arguments");
201      printUsage(errOut, "-transitionToStandby");
202      return -1;
203    }
204    
205    HAServiceTarget target = resolveTarget(argv[0]);
206    if (!checkManualStateManagementOK(target)) {
207      return -1;
208    }
209    HAServiceProtocol proto = target.getProxy(
210        getConf(), 0);
211    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
212    return 0;
213  }
214  /**
215   * Ensure that we are allowed to manually manage the HA state of the target
216   * service. If automatic failover is configured, then the automatic
217   * failover controllers should be doing state management, and it is generally
218   * an error to use the HAAdmin command line to do so.
219   * 
220   * @param target the target to check
221   * @return true if manual state management is allowed
222   */
223  private boolean checkManualStateManagementOK(HAServiceTarget target) {
224    if (target.isAutoFailoverEnabled()) {
225      if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
226        errOut.println(
227            "Automatic failover is enabled for " + target + "\n" +
228            "Refusing to manually manage HA state, since it may cause\n" +
229            "a split-brain scenario or other incorrect state.\n" +
230            "If you are very sure you know what you are doing, please \n" +
231            "specify the --" + FORCEMANUAL + " flag.");
232        return false;
233      } else {
234        LOG.warn("Proceeding with manual HA state management even though\n" +
235            "automatic failover is enabled for " + target);
236        return true;
237      }
238    }
239    return true;
240  }
241
242  private StateChangeRequestInfo createReqInfo() {
243    return new StateChangeRequestInfo(requestSource);
244  }
245
246  private int failover(CommandLine cmd)
247      throws IOException, ServiceFailedException {
248    boolean forceFence = cmd.hasOption(FORCEFENCE);
249    boolean forceActive = cmd.hasOption(FORCEACTIVE);
250
251    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
252    final String[] args = cmd.getArgs();
253
254    if (numOpts > 3 || args.length != 2) {
255      errOut.println("failover: incorrect arguments");
256      printUsage(errOut, "-failover");
257      return -1;
258    }
259
260    HAServiceTarget fromNode = resolveTarget(args[0]);
261    HAServiceTarget toNode = resolveTarget(args[1]);
262    
263    // Check that auto-failover is consistently configured for both nodes.
264    Preconditions.checkState(
265        fromNode.isAutoFailoverEnabled() ==
266          toNode.isAutoFailoverEnabled(),
267          "Inconsistent auto-failover configs between %s and %s!",
268          fromNode, toNode);
269    
270    if (fromNode.isAutoFailoverEnabled()) {
271      if (forceFence || forceActive) {
272        // -forceActive doesn't make sense with auto-HA, since, if the node
273        // is not healthy, then its ZKFC will immediately quit the election
274        // again the next time a health check runs.
275        //
276        // -forceFence doesn't seem to have any real use cases with auto-HA
277        // so it isn't implemented.
278        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
279            "supported with auto-failover enabled.");
280        return -1;
281      }
282      try {
283        return gracefulFailoverThroughZKFCs(toNode);
284      } catch (UnsupportedOperationException e){
285        errOut.println("Failover command is not supported with " +
286            "auto-failover enabled: " + e.getLocalizedMessage());
287        return -1;
288      }
289    }
290    
291    FailoverController fc = new FailoverController(getConf(),
292        requestSource);
293    
294    try {
295      fc.failover(fromNode, toNode, forceFence, forceActive); 
296      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
297    } catch (FailoverFailedException ffe) {
298      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
299      return -1;
300    }
301    return 0;
302  }
303  
304
305  /**
306   * Initiate a graceful failover by talking to the target node's ZKFC.
307   * This sends an RPC to the ZKFC, which coordinates the failover.
308   * 
309   * @param toNode the node to fail to
310   * @return status code (0 for success)
311   * @throws IOException if failover does not succeed
312   */
313  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
314      throws IOException {
315
316    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
317    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
318    try {
319      proxy.gracefulFailover();
320      out.println("Failover to " + toNode + " successful");
321    } catch (ServiceFailedException sfe) {
322      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
323      return -1;
324    }
325
326    return 0;
327  }
328
329  private int checkHealth(final CommandLine cmd)
330      throws IOException, ServiceFailedException {
331    String[] argv = cmd.getArgs();
332    if (argv.length != 1) {
333      errOut.println("checkHealth: incorrect number of arguments");
334      printUsage(errOut, "-checkHealth");
335      return -1;
336    }
337    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
338        getConf(), rpcTimeoutForChecks);
339    try {
340      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
341    } catch (HealthCheckFailedException e) {
342      errOut.println("Health check failed: " + e.getLocalizedMessage());
343      return -1;
344    }
345    return 0;
346  }
347
348  private int getServiceState(final CommandLine cmd)
349      throws IOException, ServiceFailedException {
350    String[] argv = cmd.getArgs();
351    if (argv.length != 1) {
352      errOut.println("getServiceState: incorrect number of arguments");
353      printUsage(errOut, "-getServiceState");
354      return -1;
355    }
356
357    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
358        getConf(), rpcTimeoutForChecks);
359    out.println(proto.getServiceStatus().getState());
360    return 0;
361  }
362
363  /**
364   * Return the serviceId as is, we are assuming it was
365   * given as a service address of form <host:ipcport>.
366   */
367  protected String getServiceAddr(String serviceId) {
368    return serviceId;
369  }
370
371  @Override
372  public void setConf(Configuration conf) {
373    super.setConf(conf);
374    if (conf != null) {
375      rpcTimeoutForChecks = conf.getInt(
376          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
377          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
378    }
379  }
380
381  @Override
382  public int run(String[] argv) throws Exception {
383    try {
384      return runCmd(argv);
385    } catch (IllegalArgumentException iae) {
386      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
387      return -1;
388    } catch (IOException ioe) {
389      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
390      if (LOG.isDebugEnabled()) {
391        LOG.debug("Operation failed", ioe);
392      }
393      return -1;
394    }
395  }
396  
397  protected int runCmd(String[] argv) throws Exception {
398    if (argv.length < 1) {
399      printUsage(errOut);
400      return -1;
401    }
402
403    String cmd = argv[0];
404
405    if (!cmd.startsWith("-")) {
406      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
407      printUsage(errOut);
408      return -1;
409    }
410    
411    if (!USAGE.containsKey(cmd)) {
412      errOut.println(cmd.substring(1) + ": Unknown command");
413      printUsage(errOut);
414      return -1;
415    }
416    
417    Options opts = new Options();
418
419    // Add command-specific options
420    if ("-failover".equals(cmd)) {
421      addFailoverCliOpts(opts);
422    }
423    if("-transitionToActive".equals(cmd)) {
424      addTransitionToActiveCliOpts(opts);
425    }
426    // Mutative commands take FORCEMANUAL option
427    if ("-transitionToActive".equals(cmd) ||
428        "-transitionToStandby".equals(cmd) ||
429        "-failover".equals(cmd)) {
430      opts.addOption(FORCEMANUAL, false,
431          "force manual control even if auto-failover is enabled");
432    }
433         
434    CommandLine cmdLine = parseOpts(cmd, opts, argv);
435    if (cmdLine == null) {
436      // error already printed
437      return -1;
438    }
439    
440    if (cmdLine.hasOption(FORCEMANUAL)) {
441      if (!confirmForceManual()) {
442        LOG.fatal("Aborted");
443        return -1;
444      }
445      // Instruct the NNs to honor this request even if they're
446      // configured for manual failover.
447      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
448    }
449
450    if ("-transitionToActive".equals(cmd)) {
451      return transitionToActive(cmdLine);
452    } else if ("-transitionToStandby".equals(cmd)) {
453      return transitionToStandby(cmdLine);
454    } else if ("-failover".equals(cmd)) {
455      return failover(cmdLine);
456    } else if ("-getServiceState".equals(cmd)) {
457      return getServiceState(cmdLine);
458    } else if ("-checkHealth".equals(cmd)) {
459      return checkHealth(cmdLine);
460    } else if ("-help".equals(cmd)) {
461      return help(argv);
462    } else {
463      // we already checked command validity above, so getting here
464      // would be a coding error
465      throw new AssertionError("Should not get here, command: " + cmd);
466    } 
467  }
468  
469  private boolean confirmForceManual() throws IOException {
470     return ToolRunner.confirmPrompt(
471        "You have specified the --" + FORCEMANUAL + " flag. This flag is " +
472        "dangerous, as it can induce a split-brain scenario that WILL " +
473        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
474        "\n" +
475        "It is recommended not to use this flag, but instead to shut down the " +
476        "cluster and disable automatic failover if you prefer to manually " +
477        "manage your HA state.\n" +
478        "\n" +
479        "You may abort safely by answering 'n' or hitting ^C now.\n" +
480        "\n" +
481        "Are you sure you want to continue?");
482  }
483
484  /**
485   * Add CLI options which are specific to the failover command and no
486   * others.
487   */
488  private void addFailoverCliOpts(Options failoverOpts) {
489    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
490    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
491    // Don't add FORCEMANUAL, since that's added separately for all commands
492    // that change state.
493  }
494  
495  /**
496   * Add CLI options which are specific to the transitionToActive command and
497   * no others.
498   */
499  private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
500    transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
501  }
502  
503  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
504    try {
505      // Strip off the first arg, since that's just the command name
506      argv = Arrays.copyOfRange(argv, 1, argv.length); 
507      return new GnuParser().parse(opts, argv);
508    } catch (ParseException pe) {
509      errOut.println(cmdName.substring(1) +
510          ": incorrect arguments");
511      printUsage(errOut, cmdName);
512      return null;
513    }
514  }
515  
516  private int help(String[] argv) {
517    if (argv.length == 1) { // only -help
518      printUsage(out);
519      return 0;
520    } else if (argv.length != 2) {
521      printUsage(errOut, "-help");
522      return -1;
523    }
524    String cmd = argv[1];
525    if (!cmd.startsWith("-")) {
526      cmd = "-" + cmd;
527    }
528    UsageInfo usageInfo = USAGE.get(cmd);
529    if (usageInfo == null) {
530      errOut.println(cmd + ": Unknown command");
531      printUsage(errOut);
532      return -1;
533    }
534    
535    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
536    return 0;
537  }
538  
539  protected static class UsageInfo {
540    public final String args;
541    public final String help;
542    
543    public UsageInfo(String args, String help) {
544      this.args = args;
545      this.help = help;
546    }
547  }
548}