/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.ha;

import java.io.IOException;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Map;

import org.apache.commons.cli.Options;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.ParseException;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.CommonConfigurationKeys;
import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
import org.apache.hadoop.ha.HAServiceProtocol.StateChangeRequestInfo;
import org.apache.hadoop.ha.HAServiceProtocol.RequestSource;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableMap;

/**
 * A command-line tool for making calls in the HAServiceProtocol.
 * For example, this can be used to force a service to standby or active
 * mode, or to trigger a health-check.
 */
@InterfaceAudience.Private

public abstract class HAAdmin extends Configured implements Tool {

  private static final String FORCEFENCE = "forcefence";
  private static final String FORCEACTIVE = "forceactive";

  /**
   * Undocumented flag which allows an administrator to use manual failover
   * state transitions even when auto-failover is enabled. This is an unsafe
   * operation, which is why it is not documented in the usage below.
   */
  private static final String FORCEMANUAL = "forcemanual";
  private static final Log LOG = LogFactory.getLog(HAAdmin.class);

  /**
   * RPC timeout handed to the proxies used by -checkHealth and
   * -getServiceState. Stays at -1 until a non-null configuration is applied.
   */
  private int rpcTimeoutForChecks = -1;

  /** Usage text for every supported sub-command, keyed by command name. */
  protected final static Map<String, UsageInfo> USAGE =
      ImmutableMap.<String, UsageInfo>builder()
          .put("-transitionToActive",
              new UsageInfo("[--"+FORCEACTIVE+"] <serviceId>", "Transitions the service into Active state"))
          .put("-transitionToStandby",
              new UsageInfo("<serviceId>", "Transitions the service into Standby state"))
          .put("-failover",
              new UsageInfo("[--"+FORCEFENCE+"] [--"+FORCEACTIVE+"] <serviceId> <serviceId>",
                  "Failover from the first service to the second.\n" +
                  "Unconditionally fence services if the --"+FORCEFENCE+" option is used.\n" +
                  "Try to failover to the target service even if it is not ready if the " +
                  "--" + FORCEACTIVE + " option is used."))
          .put("-getServiceState",
              new UsageInfo("<serviceId>", "Returns the state of the service"))
          .put("-checkHealth",
              new UsageInfo("<serviceId>",
                  "Requests that the service perform a health check.\n" +
                  "The HAAdmin tool will exit with a non-zero exit code\n" +
                  "if the check fails."))
          .put("-help",
              new UsageInfo("<command>", "Displays help on the specified command"))
          .build();

  /** Output stream for errors, for use in tests */
  protected PrintStream errOut = System.err;
  /** Output stream for regular command output. */
  protected PrintStream out = System.out;
  /**
   * Source recorded in state-change requests; switched to
   * REQUEST_BY_USER_FORCED once the --forcemanual flag has been confirmed.
   */
  private RequestSource requestSource = RequestSource.REQUEST_BY_USER;

  /** Creates an admin tool without a configuration. */
  protected HAAdmin() {
    super();
  }

  /**
   * Creates an admin tool using the given configuration.
   *
   * @param conf the configuration to use, may be null
   */
  protected HAAdmin(Configuration conf) {
    super(conf);
    // Configured's constructor applies conf through setConf(), but the
    // field initializer of rpcTimeoutForChecks only runs after super(conf)
    // has returned and resets the field to -1. Read the timeout again so
    // the configured value is not lost.
    loadCheckTimeout(conf);
  }

  /**
   * Maps a service identifier given on the command line to the target
   * it designates.
   *
   * @param string the service identifier
   * @return the corresponding target
   */
  protected abstract HAServiceTarget resolveTarget(String string);

  /**
   * Returns the ids of the targets to examine before the given node is made
   * active. This base implementation returns a modifiable list holding only
   * the given id.
   *
   * @param targetNodeToActivate id of the node about to be activated
   * @return the target ids
   */
  protected Collection<String> getTargetIds(String targetNodeToActivate) {
    return new ArrayList<String>(
        Arrays.asList(new String[]{targetNodeToActivate}));
  }

  /**
   * @return the first line of the usage message
   */
  protected String getUsageString() {
    return "Usage: HAAdmin";
  }

  /**
   * Prints the usage of every command followed by the generic options.
   *
   * @param errOut stream to print to
   */
  protected void printUsage(PrintStream errOut) {
    errOut.println(getUsageString());
    for (Map.Entry<String, UsageInfo> e : USAGE.entrySet()) {
      String cmd = e.getKey();
      UsageInfo usage = e.getValue();

      errOut.println(" [" + cmd + " " + usage.args + "]");
    }
    errOut.println();
    ToolRunner.printGenericCommandUsage(errOut);
  }

  /**
   * Prints the usage of a single command.
   *
   * @param errOut stream to print to
   * @param cmd command name, including the leading dash
   * @throws RuntimeException if there is no usage entry for the command
   */
  private void printUsage(PrintStream errOut, String cmd) {
    UsageInfo usage = USAGE.get(cmd);
    if (usage == null) {
      throw new RuntimeException("No usage for cmd " + cmd);
    }
    errOut.println(getUsageString() + " [" + cmd + " " + usage.args + "]");
  }

  /**
   * Handles -transitionToActive.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 if the request was rejected
   */
  private int transitionToActive(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("transitionToActive: incorrect number of arguments");
      printUsage(errOut, "-transitionToActive");
      return -1;
    }
    final boolean forceActive = cmd.hasOption(FORCEACTIVE);
    // Unless --forceactive was given, refuse when another target is already
    // active or when the state of another target could not be determined.
    if (!forceActive && isOtherTargetNodeActive(argv[0], forceActive)) {
      return -1;
    }
    HAServiceTarget target = resolveTarget(argv[0]);
    if (!checkManualStateManagementOK(target)) {
      return -1;
    }
    HAServiceProtocol proto = target.getProxy(
        getConf(), 0);
    HAServiceProtocolHelper.transitionToActive(proto, createReqInfo());
    return 0;
  }

  /**
   * Checks whether a target other than the one being activated stands in
   * the way of the transition.
   *
   * @param targetNodeToActivate id of the node that is to become active
   * @param forceActive whether failures while querying other targets
   *          should be ignored
   * @return true if another target is already active, if manual state
   *         management is not allowed for another target, or if querying
   *         another target failed and forceActive was not set; otherwise
   *         false
   * @throws IOException
   */
  private boolean isOtherTargetNodeActive(String targetNodeToActivate, boolean forceActive)
      throws IOException {
    // Work on a private copy: getTargetIds() is overridable and is not
    // required to hand back a collection that may be modified.
    Collection<String> targetIds =
        new ArrayList<String>(getTargetIds(targetNodeToActivate));
    targetIds.remove(targetNodeToActivate);
    for (String targetId : targetIds) {
      HAServiceTarget target = resolveTarget(targetId);
      if (!checkManualStateManagementOK(target)) {
        return true;
      }
      try {
        HAServiceProtocol proto = target.getProxy(getConf(), 5000);
        if (proto.getServiceStatus().getState() == HAServiceState.ACTIVE) {
          errOut.println("transitionToActive: Node " + targetId +" is already active");
          printUsage(errOut, "-transitionToActive");
          return true;
        }
      } catch (Exception e) {
        // Without forceActive a target whose state cannot be read blocks the
        // transition; with forceActive the failure is deliberately ignored.
        if (!forceActive) {
          errOut.println("Unexpected error occurred " + e.getMessage());
          printUsage(errOut, "-transitionToActive");
          return true;
        }
      }
    }
    return false;
  }

  /**
   * Handles -transitionToStandby.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 if the request was rejected
   */
  private int transitionToStandby(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("transitionToStandby: incorrect number of arguments");
      printUsage(errOut, "-transitionToStandby");
      return -1;
    }

    HAServiceTarget target = resolveTarget(argv[0]);
    if (!checkManualStateManagementOK(target)) {
      return -1;
    }
    HAServiceProtocol proto = target.getProxy(
        getConf(), 0);
    HAServiceProtocolHelper.transitionToStandby(proto, createReqInfo());
    return 0;
  }

  /**
   * Ensure that we are allowed to manually manage the HA state of the target
   * service. If automatic failover is configured, then the automatic
   * failover controllers should be doing state management, and it is generally
   * an error to use the HAAdmin command line to do so.
   *
   * @param target the target to check
   * @return true if manual state management is allowed
   */
  private boolean checkManualStateManagementOK(HAServiceTarget target) {
    if (target.isAutoFailoverEnabled()) {
      if (requestSource != RequestSource.REQUEST_BY_USER_FORCED) {
        errOut.println(
            "Automatic failover is enabled for " + target + "\n" +
            "Refusing to manually manage HA state, since it may cause\n" +
            "a split-brain scenario or other incorrect state.\n" +
            "If you are very sure you know what you are doing, please \n" +
            "specify the --" + FORCEMANUAL + " flag.");
        return false;
      } else {
        LOG.warn("Proceeding with manual HA state management even though\n" +
            "automatic failover is enabled for " + target);
        return true;
      }
    }
    return true;
  }

  /**
   * @return request info carrying the current request source
   */
  private StateChangeRequestInfo createReqInfo() {
    return new StateChangeRequestInfo(requestSource);
  }

  /**
   * Handles -failover. When auto-failover is enabled the request is handed
   * to the ZKFC of the destination node, otherwise a FailoverController
   * performs the failover directly.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on failure or bad arguments
   */
  private int failover(CommandLine cmd)
      throws IOException, ServiceFailedException {
    boolean forceFence = cmd.hasOption(FORCEFENCE);
    boolean forceActive = cmd.hasOption(FORCEACTIVE);

    int numOpts = cmd.getOptions() == null ? 0 : cmd.getOptions().length;
    final String[] args = cmd.getArgs();

    if (numOpts > 3 || args.length != 2) {
      errOut.println("failover: incorrect arguments");
      printUsage(errOut, "-failover");
      return -1;
    }

    HAServiceTarget fromNode = resolveTarget(args[0]);
    HAServiceTarget toNode = resolveTarget(args[1]);

    // Check that auto-failover is consistently configured for both nodes.
    Preconditions.checkState(
        fromNode.isAutoFailoverEnabled() ==
          toNode.isAutoFailoverEnabled(),
        "Inconsistent auto-failover configs between %s and %s!",
        fromNode, toNode);

    if (fromNode.isAutoFailoverEnabled()) {
      if (forceFence || forceActive) {
        // -forceActive doesn't make sense with auto-HA, since, if the node
        // is not healthy, then its ZKFC will immediately quit the election
        // again the next time a health check runs.
        //
        // -forceFence doesn't seem to have any real use cases with auto-HA
        // so it isn't implemented.
        errOut.println(FORCEFENCE + " and " + FORCEACTIVE + " flags not " +
            "supported with auto-failover enabled.");
        return -1;
      }
      try {
        return gracefulFailoverThroughZKFCs(toNode);
      } catch (UnsupportedOperationException e){
        errOut.println("Failover command is not supported with " +
            "auto-failover enabled: " + e.getLocalizedMessage());
        return -1;
      }
    }

    FailoverController fc = new FailoverController(getConf(),
        requestSource);

    try {
      fc.failover(fromNode, toNode, forceFence, forceActive);
      out.println("Failover from "+args[0]+" to "+args[1]+" successful");
    } catch (FailoverFailedException ffe) {
      errOut.println("Failover failed: " + ffe.getLocalizedMessage());
      return -1;
    }
    return 0;
  }


  /**
   * Initiate a graceful failover by talking to the target node's ZKFC.
   * This sends an RPC to the ZKFC, which coordinates the failover.
   *
   * @param toNode the node to fail to
   * @return status code (0 for success)
   * @throws IOException if failover does not succeed
   */
  private int gracefulFailoverThroughZKFCs(HAServiceTarget toNode)
      throws IOException {

    int timeout = FailoverController.getRpcTimeoutToNewActive(getConf());
    ZKFCProtocol proxy = toNode.getZKFCProxy(getConf(), timeout);
    try {
      proxy.gracefulFailover();
      out.println("Failover to " + toNode + " successful");
    } catch (ServiceFailedException sfe) {
      errOut.println("Failover failed: " + sfe.getLocalizedMessage());
      return -1;
    }

    return 0;
  }

  /**
   * Handles -checkHealth.
   *
   * @param cmd the parsed command line
   * @return 0 if the health check passed, -1 otherwise
   */
  private int checkHealth(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("checkHealth: incorrect number of arguments");
      printUsage(errOut, "-checkHealth");
      return -1;
    }
    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
        getConf(), rpcTimeoutForChecks);
    try {
      HAServiceProtocolHelper.monitorHealth(proto, createReqInfo());
    } catch (HealthCheckFailedException e) {
      errOut.println("Health check failed: " + e.getLocalizedMessage());
      return -1;
    }
    return 0;
  }

  /**
   * Handles -getServiceState by printing the state reported by the target.
   *
   * @param cmd the parsed command line
   * @return 0 on success, -1 on bad arguments
   */
  private int getServiceState(final CommandLine cmd)
      throws IOException, ServiceFailedException {
    String[] argv = cmd.getArgs();
    if (argv.length != 1) {
      errOut.println("getServiceState: incorrect number of arguments");
      printUsage(errOut, "-getServiceState");
      return -1;
    }

    HAServiceProtocol proto = resolveTarget(argv[0]).getProxy(
        getConf(), rpcTimeoutForChecks);
    out.println(proto.getServiceStatus().getState());
    return 0;
  }

  /**
   * Return the serviceId as is, we are assuming it was
   * given as a service address of form {@code host:ipcport}.
   */
  protected String getServiceAddr(String serviceId) {
    return serviceId;
  }

  /**
   * Stores the configuration and refreshes the RPC timeout used for the
   * check commands.
   */
  @Override
  public void setConf(Configuration conf) {
    super.setConf(conf);
    loadCheckTimeout(conf);
  }

  /**
   * Reads the RPC timeout for check commands from the given configuration.
   * A null configuration leaves the current value untouched.
   */
  private void loadCheckTimeout(Configuration conf) {
    if (conf != null) {
      rpcTimeoutForChecks = conf.getInt(
          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_KEY,
          CommonConfigurationKeys.HA_FC_CLI_CHECK_TIMEOUT_DEFAULT);
    }
  }

  /**
   * Runs the command, turning argument and I/O failures into an error
   * message and a -1 exit code.
   */
  @Override
  public int run(String[] argv) throws Exception {
    try {
      return runCmd(argv);
    } catch (IllegalArgumentException iae) {
      errOut.println("Illegal argument: " + iae.getLocalizedMessage());
      return -1;
    } catch (IOException ioe) {
      errOut.println("Operation failed: " + ioe.getLocalizedMessage());
      if (LOG.isDebugEnabled()) {
        LOG.debug("Operation failed", ioe);
      }
      return -1;
    }
  }

  /**
   * Validates the command name, parses its options and dispatches to the
   * matching handler.
   *
   * @param argv the command name followed by its arguments
   * @return the exit code of the command, -1 on usage errors
   */
  protected int runCmd(String[] argv) throws Exception {
    if (argv.length < 1) {
      printUsage(errOut);
      return -1;
    }

    String cmd = argv[0];

    if (!cmd.startsWith("-")) {
      errOut.println("Bad command '" + cmd + "': expected command starting with '-'");
      printUsage(errOut);
      return -1;
    }

    if (!USAGE.containsKey(cmd)) {
      errOut.println(cmd.substring(1) + ": Unknown command");
      printUsage(errOut);
      return -1;
    }

    Options opts = new Options();

    // Add command-specific options
    if ("-failover".equals(cmd)) {
      addFailoverCliOpts(opts);
    }
    if ("-transitionToActive".equals(cmd)) {
      addTransitionToActiveCliOpts(opts);
    }
    // Mutative commands take FORCEMANUAL option
    if ("-transitionToActive".equals(cmd) ||
        "-transitionToStandby".equals(cmd) ||
        "-failover".equals(cmd)) {
      opts.addOption(FORCEMANUAL, false,
          "force manual control even if auto-failover is enabled");
    }

    CommandLine cmdLine = parseOpts(cmd, opts, argv);
    if (cmdLine == null) {
      // error already printed
      return -1;
    }

    if (cmdLine.hasOption(FORCEMANUAL)) {
      if (!confirmForceManual()) {
        LOG.fatal("Aborted");
        return -1;
      }
      // Instruct the NNs to honor this request even if they're
      // configured for automatic failover.
      requestSource = RequestSource.REQUEST_BY_USER_FORCED;
    }

    if ("-transitionToActive".equals(cmd)) {
      return transitionToActive(cmdLine);
    } else if ("-transitionToStandby".equals(cmd)) {
      return transitionToStandby(cmdLine);
    } else if ("-failover".equals(cmd)) {
      return failover(cmdLine);
    } else if ("-getServiceState".equals(cmd)) {
      return getServiceState(cmdLine);
    } else if ("-checkHealth".equals(cmd)) {
      return checkHealth(cmdLine);
    } else if ("-help".equals(cmd)) {
      return help(argv);
    } else {
      // we already checked command validity above, so getting here
      // would be a coding error
      throw new AssertionError("Should not get here, command: " + cmd);
    }
  }

  /**
   * Asks the user to confirm the use of the --forcemanual flag.
   *
   * @return true if the user chose to continue
   */
  private boolean confirmForceManual() throws IOException {
    return ToolRunner.confirmPrompt(
        "You have specified the --" + FORCEMANUAL + " flag. This flag is " +
        "dangerous, as it can induce a split-brain scenario that WILL " +
        "CORRUPT your HDFS namespace, possibly irrecoverably.\n" +
        "\n" +
        "It is recommended not to use this flag, but instead to shut down the " +
        "cluster and disable automatic failover if you prefer to manually " +
        "manage your HA state.\n" +
        "\n" +
        "You may abort safely by answering 'n' or hitting ^C now.\n" +
        "\n" +
        "Are you sure you want to continue?");
  }

  /**
   * Add CLI options which are specific to the failover command and no
   * others.
   */
  private void addFailoverCliOpts(Options failoverOpts) {
    failoverOpts.addOption(FORCEFENCE, false, "force fencing");
    failoverOpts.addOption(FORCEACTIVE, false, "force failover");
    // Don't add FORCEMANUAL, since that's added separately for all commands
    // that change state.
  }

  /**
   * Add CLI options which are specific to the transitionToActive command and
   * no others.
   */
  private void addTransitionToActiveCliOpts(Options transitionToActiveCliOpts) {
    transitionToActiveCliOpts.addOption(FORCEACTIVE, false, "force active");
  }

  /**
   * Parses everything after the command name against the given options.
   *
   * @param cmdName command name, including the leading dash
   * @param opts options accepted by the command
   * @param argv full argument vector, command name first
   * @return the parsed command line, or null if parsing failed (an error
   *         message and the usage have been printed in that case)
   */
  private CommandLine parseOpts(String cmdName, Options opts, String[] argv) {
    try {
      // Strip off the first arg, since that's just the command name
      argv = Arrays.copyOfRange(argv, 1, argv.length);
      return new GnuParser().parse(opts, argv);
    } catch (ParseException pe) {
      errOut.println(cmdName.substring(1) +
          ": incorrect arguments");
      printUsage(errOut, cmdName);
      return null;
    }
  }

  /**
   * Handles -help: prints the general usage, or the help text of a single
   * command named with or without its leading dash.
   *
   * NOTE(review): runCmd parses argv with an empty option set before
   * calling this method, so a dash-prefixed operand (e.g. "-help -failover")
   * is presumably rejected by the parser and never reaches this method --
   * confirm.
   *
   * @param argv full argument vector, "-help" first
   * @return 0 on success, -1 on bad arguments or unknown command
   */
  private int help(String[] argv) {
    if (argv.length == 1) { // only -help
      printUsage(out);
      return 0;
    } else if (argv.length != 2) {
      printUsage(errOut, "-help");
      return -1;
    }
    String cmd = argv[1];
    if (!cmd.startsWith("-")) {
      cmd = "-" + cmd;
    }
    UsageInfo usageInfo = USAGE.get(cmd);
    if (usageInfo == null) {
      errOut.println(cmd + ": Unknown command");
      printUsage(errOut);
      return -1;
    }

    out.println(cmd + " [" + usageInfo.args + "]: " + usageInfo.help);
    return 0;
  }

  /** Argument synopsis and help text of one sub-command. */
  protected static class UsageInfo {
    public final String args;
    public final String help;

    public UsageInfo(String args, String help) {
      this.args = args;
      this.help = help;
    }
  }
}