001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.mapreduce.v2.hs; 020 021import java.io.FileNotFoundException; 022import java.io.IOException; 023import java.net.ConnectException; 024import java.util.ArrayList; 025import java.util.Collection; 026import java.util.Collections; 027import java.util.HashSet; 028import java.util.Iterator; 029import java.util.List; 030import java.util.NavigableSet; 031import java.util.Set; 032import java.util.SortedMap; 033import java.util.TreeMap; 034import java.util.concurrent.ConcurrentHashMap; 035import java.util.concurrent.ConcurrentMap; 036import java.util.concurrent.ConcurrentSkipListMap; 037import java.util.concurrent.LinkedBlockingQueue; 038import java.util.concurrent.ThreadFactory; 039import java.util.concurrent.ThreadPoolExecutor; 040import java.util.concurrent.TimeUnit; 041import java.util.concurrent.atomic.AtomicInteger; 042 043import org.apache.commons.logging.Log; 044import org.apache.commons.logging.LogFactory; 045import org.apache.hadoop.classification.InterfaceAudience; 046import org.apache.hadoop.classification.InterfaceStability; 047import org.apache.hadoop.conf.Configuration; 048import org.apache.hadoop.fs.FSDataInputStream; 049import org.apache.hadoop.fs.FileAlreadyExistsException; 050import org.apache.hadoop.fs.FileContext; 051import org.apache.hadoop.fs.FileStatus; 052import org.apache.hadoop.fs.Options; 053import org.apache.hadoop.fs.Path; 054import org.apache.hadoop.fs.PathFilter; 055import org.apache.hadoop.fs.RemoteIterator; 056import org.apache.hadoop.fs.UnsupportedFileSystemException; 057import org.apache.hadoop.fs.permission.FsPermission; 058import org.apache.hadoop.mapred.JobACLsManager; 059import org.apache.hadoop.mapreduce.jobhistory.JobSummary; 060import org.apache.hadoop.mapreduce.v2.api.records.JobId; 061import org.apache.hadoop.mapreduce.v2.app.job.Job; 062import org.apache.hadoop.mapreduce.v2.jobhistory.FileNameIndexUtils; 063import org.apache.hadoop.mapreduce.v2.jobhistory.JHAdminConfig; 064import org.apache.hadoop.mapreduce.v2.jobhistory.JobHistoryUtils; 065import org.apache.hadoop.mapreduce.v2.jobhistory.JobIndexInfo; 066import org.apache.hadoop.security.AccessControlException; 067import org.apache.hadoop.service.AbstractService; 068import org.apache.hadoop.util.ShutdownThreadsHelper; 069import org.apache.hadoop.yarn.exceptions.YarnRuntimeException; 070 071import com.google.common.annotations.VisibleForTesting; 072import com.google.common.util.concurrent.ThreadFactoryBuilder; 073import org.apache.hadoop.yarn.util.Clock; 074import org.apache.hadoop.yarn.util.SystemClock; 075 076/** 077 * This class provides a way to interact with history files in a thread safe 078 * manor. 079 */ 080@InterfaceAudience.Public 081@InterfaceStability.Unstable 082public class HistoryFileManager extends AbstractService { 083 private static final Log LOG = LogFactory.getLog(HistoryFileManager.class); 084 private static final Log SUMMARY_LOG = LogFactory.getLog(JobSummary.class); 085 086 private static enum HistoryInfoState { 087 IN_INTERMEDIATE, IN_DONE, DELETED, MOVE_FAILED 088 }; 089 090 private static String DONE_BEFORE_SERIAL_TAIL = JobHistoryUtils 091 .doneSubdirsBeforeSerialTail(); 092 093 /** 094 * Maps between a serial number (generated based on jobId) and the timestamp 095 * component(s) to which it belongs. Facilitates jobId based searches. If a 096 * jobId is not found in this list - it will not be found. 097 */ 098 private static class SerialNumberIndex { 099 private SortedMap<String, Set<String>> cache; 100 private int maxSize; 101 102 public SerialNumberIndex(int maxSize) { 103 this.cache = new TreeMap<String, Set<String>>(); 104 this.maxSize = maxSize; 105 } 106 107 public synchronized void add(String serialPart, String timestampPart) { 108 if (!cache.containsKey(serialPart)) { 109 cache.put(serialPart, new HashSet<String>()); 110 if (cache.size() > maxSize) { 111 String key = cache.firstKey(); 112 LOG.error("Dropping " + key 113 + " from the SerialNumberIndex. We will no " 114 + "longer be able to see jobs that are in that serial index for " 115 + cache.get(key)); 116 cache.remove(key); 117 } 118 } 119 Set<String> datePartSet = cache.get(serialPart); 120 datePartSet.add(timestampPart); 121 } 122 123 public synchronized void remove(String serialPart, String timeStampPart) { 124 if (cache.containsKey(serialPart)) { 125 Set<String> set = cache.get(serialPart); 126 set.remove(timeStampPart); 127 if (set.isEmpty()) { 128 cache.remove(serialPart); 129 } 130 } 131 } 132 133 public synchronized Set<String> get(String serialPart) { 134 Set<String> found = cache.get(serialPart); 135 if (found != null) { 136 return new HashSet<String>(found); 137 } 138 return null; 139 } 140 } 141 142 /** 143 * Wrapper around {@link ConcurrentSkipListMap} that maintains size along 144 * side for O(1) size() implementation for use in JobListCache. 145 * 146 * Note: The size is not updated atomically with changes additions/removals. 147 * This race can lead to size() returning an incorrect size at times. 148 */ 149 static class JobIdHistoryFileInfoMap { 150 private ConcurrentSkipListMap<JobId, HistoryFileInfo> cache; 151 private AtomicInteger mapSize; 152 153 JobIdHistoryFileInfoMap() { 154 cache = new ConcurrentSkipListMap<JobId, HistoryFileInfo>(); 155 mapSize = new AtomicInteger(); 156 } 157 158 public HistoryFileInfo putIfAbsent(JobId key, HistoryFileInfo value) { 159 HistoryFileInfo ret = cache.putIfAbsent(key, value); 160 if (ret == null) { 161 mapSize.incrementAndGet(); 162 } 163 return ret; 164 } 165 166 public HistoryFileInfo remove(JobId key) { 167 HistoryFileInfo ret = cache.remove(key); 168 if (ret != null) { 169 mapSize.decrementAndGet(); 170 } 171 return ret; 172 } 173 174 /** 175 * Returns the recorded size of the internal map. Note that this could be out 176 * of sync with the actual size of the map 177 * @return "recorded" size 178 */ 179 public int size() { 180 return mapSize.get(); 181 } 182 183 public HistoryFileInfo get(JobId key) { 184 return cache.get(key); 185 } 186 187 public NavigableSet<JobId> navigableKeySet() { 188 return cache.navigableKeySet(); 189 } 190 191 public Collection<HistoryFileInfo> values() { 192 return cache.values(); 193 } 194 } 195 196 static class JobListCache { 197 private JobIdHistoryFileInfoMap cache; 198 private int maxSize; 199 private long maxAge; 200 201 public JobListCache(int maxSize, long maxAge) { 202 this.maxSize = maxSize; 203 this.maxAge = maxAge; 204 this.cache = new JobIdHistoryFileInfoMap(); 205 } 206 207 public HistoryFileInfo addIfAbsent(HistoryFileInfo fileInfo) { 208 JobId jobId = fileInfo.getJobId(); 209 if (LOG.isDebugEnabled()) { 210 LOG.debug("Adding " + jobId + " to job list cache with " 211 + fileInfo.getJobIndexInfo()); 212 } 213 HistoryFileInfo old = cache.putIfAbsent(jobId, fileInfo); 214 if (cache.size() > maxSize) { 215 //There is a race here, where more then one thread could be trying to 216 // remove entries. This could result in too many entries being removed 217 // from the cache. This is considered OK as the size of the cache 218 // should be rather large, and we would rather have performance over 219 // keeping the cache size exactly at the maximum. 220 Iterator<JobId> keys = cache.navigableKeySet().iterator(); 221 long cutoff = System.currentTimeMillis() - maxAge; 222 223 // MAPREDUCE-6436: In order to reduce the number of logs written 224 // in case of a lot of move pending histories. 225 JobId firstInIntermediateKey = null; 226 int inIntermediateCount = 0; 227 JobId firstMoveFailedKey = null; 228 int moveFailedCount = 0; 229 230 while(cache.size() > maxSize && keys.hasNext()) { 231 JobId key = keys.next(); 232 HistoryFileInfo firstValue = cache.get(key); 233 if(firstValue != null) { 234 synchronized(firstValue) { 235 if (firstValue.isMovePending()) { 236 if(firstValue.didMoveFail() && 237 firstValue.jobIndexInfo.getFinishTime() <= cutoff) { 238 cache.remove(key); 239 //Now lets try to delete it 240 try { 241 firstValue.delete(); 242 } catch (IOException e) { 243 LOG.error("Error while trying to delete history files" + 244 " that could not be moved to done.", e); 245 } 246 } else { 247 if (firstValue.didMoveFail()) { 248 if (moveFailedCount == 0) { 249 firstMoveFailedKey = key; 250 } 251 moveFailedCount += 1; 252 } else { 253 if (inIntermediateCount == 0) { 254 firstInIntermediateKey = key; 255 } 256 inIntermediateCount += 1; 257 } 258 } 259 } else { 260 cache.remove(key); 261 } 262 } 263 } 264 } 265 // Log output only for first jobhisotry in pendings to restrict 266 // the total number of logs. 267 if (inIntermediateCount > 0) { 268 LOG.warn("Waiting to remove IN_INTERMEDIATE state histories " + 269 "(e.g. " + firstInIntermediateKey + ") from JobListCache " + 270 "because it is not in done yet. Total count is " + 271 inIntermediateCount + "."); 272 } 273 if (moveFailedCount > 0) { 274 LOG.warn("Waiting to remove MOVE_FAILED state histories " + 275 "(e.g. " + firstMoveFailedKey + ") from JobListCache " + 276 "because it is not in done yet. Total count is " + 277 moveFailedCount + "."); 278 } 279 } 280 return old; 281 } 282 283 public void delete(HistoryFileInfo fileInfo) { 284 if (LOG.isDebugEnabled()) { 285 LOG.debug("Removing from cache " + fileInfo); 286 } 287 cache.remove(fileInfo.getJobId()); 288 } 289 290 public Collection<HistoryFileInfo> values() { 291 return new ArrayList<HistoryFileInfo>(cache.values()); 292 } 293 294 public HistoryFileInfo get(JobId jobId) { 295 return cache.get(jobId); 296 } 297 298 public boolean isFull() { 299 return cache.size() >= maxSize; 300 } 301 } 302 303 /** 304 * This class represents a user dir in the intermediate done directory. This 305 * is mostly for locking purposes. 306 */ 307 private class UserLogDir { 308 long modTime = 0; 309 private long scanTime = 0; 310 311 public synchronized void scanIfNeeded(FileStatus fs) { 312 long newModTime = fs.getModificationTime(); 313 // MAPREDUCE-6680: In some Cloud FileSystem, like Azure FS or S3, file's 314 // modification time is truncated into seconds. In that case, 315 // modTime == newModTime doesn't means no file update in the directory, 316 // so we need to have additional check. 317 // Note: modTime (X second Y millisecond) could be casted to X second or 318 // X+1 second. 319 if (modTime != newModTime 320 || (scanTime/1000) == (modTime/1000) 321 || (scanTime/1000 + 1) == (modTime/1000)) { 322 // reset scanTime before scanning happens 323 scanTime = System.currentTimeMillis(); 324 Path p = fs.getPath(); 325 try { 326 scanIntermediateDirectory(p); 327 //If scanning fails, we will scan again. We assume the failure is 328 // temporary. 329 modTime = newModTime; 330 } catch (IOException e) { 331 LOG.error("Error while trying to scan the directory " + p, e); 332 } 333 } else { 334 if (LOG.isDebugEnabled()) { 335 LOG.debug("Scan not needed of " + fs.getPath()); 336 } 337 // reset scanTime 338 scanTime = System.currentTimeMillis(); 339 } 340 } 341 } 342 343 public class HistoryFileInfo { 344 private Path historyFile; 345 private Path confFile; 346 private Path summaryFile; 347 private JobIndexInfo jobIndexInfo; 348 private HistoryInfoState state; 349 350 @VisibleForTesting 351 protected HistoryFileInfo(Path historyFile, Path confFile, 352 Path summaryFile, JobIndexInfo jobIndexInfo, boolean isInDone) { 353 this.historyFile = historyFile; 354 this.confFile = confFile; 355 this.summaryFile = summaryFile; 356 this.jobIndexInfo = jobIndexInfo; 357 state = isInDone ? HistoryInfoState.IN_DONE 358 : HistoryInfoState.IN_INTERMEDIATE; 359 } 360 361 @VisibleForTesting 362 synchronized boolean isMovePending() { 363 return state == HistoryInfoState.IN_INTERMEDIATE 364 || state == HistoryInfoState.MOVE_FAILED; 365 } 366 367 @VisibleForTesting 368 synchronized boolean didMoveFail() { 369 return state == HistoryInfoState.MOVE_FAILED; 370 } 371 372 /** 373 * @return true if the files backed by this were deleted. 374 */ 375 public synchronized boolean isDeleted() { 376 return state == HistoryInfoState.DELETED; 377 } 378 379 @Override 380 public String toString() { 381 return "HistoryFileInfo jobID " + getJobId() 382 + " historyFile = " + historyFile; 383 } 384 385 @VisibleForTesting 386 synchronized void moveToDone() throws IOException { 387 if (LOG.isDebugEnabled()) { 388 LOG.debug("moveToDone: " + historyFile); 389 } 390 if (!isMovePending()) { 391 // It was either deleted or is already in done. Either way do nothing 392 if (LOG.isDebugEnabled()) { 393 LOG.debug("Move no longer pending"); 394 } 395 return; 396 } 397 try { 398 long completeTime = jobIndexInfo.getFinishTime(); 399 if (completeTime == 0) { 400 completeTime = System.currentTimeMillis(); 401 } 402 JobId jobId = jobIndexInfo.getJobId(); 403 404 List<Path> paths = new ArrayList<Path>(2); 405 if (historyFile == null) { 406 LOG.info("No file for job-history with " + jobId + " found in cache!"); 407 } else { 408 paths.add(historyFile); 409 } 410 411 if (confFile == null) { 412 LOG.info("No file for jobConf with " + jobId + " found in cache!"); 413 } else { 414 paths.add(confFile); 415 } 416 417 if (summaryFile == null || !intermediateDoneDirFc.util().exists( 418 summaryFile)) { 419 LOG.info("No summary file for job: " + jobId); 420 } else { 421 String jobSummaryString = getJobSummary(intermediateDoneDirFc, 422 summaryFile); 423 SUMMARY_LOG.info(jobSummaryString); 424 LOG.info("Deleting JobSummary file: [" + summaryFile + "]"); 425 intermediateDoneDirFc.delete(summaryFile, false); 426 summaryFile = null; 427 } 428 429 Path targetDir = canonicalHistoryLogPath(jobId, completeTime); 430 addDirectoryToSerialNumberIndex(targetDir); 431 makeDoneSubdir(targetDir); 432 if (historyFile != null) { 433 Path toPath = doneDirFc.makeQualified(new Path(targetDir, historyFile 434 .getName())); 435 if (!toPath.equals(historyFile)) { 436 moveToDoneNow(historyFile, toPath); 437 historyFile = toPath; 438 } 439 } 440 if (confFile != null) { 441 Path toPath = doneDirFc.makeQualified(new Path(targetDir, confFile 442 .getName())); 443 if (!toPath.equals(confFile)) { 444 moveToDoneNow(confFile, toPath); 445 confFile = toPath; 446 } 447 } 448 state = HistoryInfoState.IN_DONE; 449 } catch (Throwable t) { 450 LOG.error("Error while trying to move a job to done", t); 451 this.state = HistoryInfoState.MOVE_FAILED; 452 } 453 } 454 455 /** 456 * Parse a job from the JobHistoryFile, if the underlying file is not going 457 * to be deleted. 458 * 459 * @return the Job or null if the underlying file was deleted. 460 * @throws IOException 461 * if there is an error trying to read the file. 462 */ 463 public synchronized Job loadJob() throws IOException { 464 return new CompletedJob(conf, jobIndexInfo.getJobId(), historyFile, 465 false, jobIndexInfo.getUser(), this, aclsMgr); 466 } 467 468 /** 469 * Return the history file. This should only be used for testing. 470 * @return the history file. 471 */ 472 synchronized Path getHistoryFile() { 473 return historyFile; 474 } 475 476 protected synchronized void delete() throws IOException { 477 if (LOG.isDebugEnabled()) { 478 LOG.debug("deleting " + historyFile + " and " + confFile); 479 } 480 state = HistoryInfoState.DELETED; 481 doneDirFc.delete(doneDirFc.makeQualified(historyFile), false); 482 doneDirFc.delete(doneDirFc.makeQualified(confFile), false); 483 } 484 485 public JobIndexInfo getJobIndexInfo() { 486 return jobIndexInfo; 487 } 488 489 public JobId getJobId() { 490 return jobIndexInfo.getJobId(); 491 } 492 493 public synchronized Path getConfFile() { 494 return confFile; 495 } 496 497 public synchronized Configuration loadConfFile() throws IOException { 498 FileContext fc = FileContext.getFileContext(confFile.toUri(), conf); 499 Configuration jobConf = new Configuration(false); 500 jobConf.addResource(fc.open(confFile), confFile.toString(), true); 501 return jobConf; 502 } 503 } 504 505 private SerialNumberIndex serialNumberIndex = null; 506 protected JobListCache jobListCache = null; 507 508 // Maintains a list of known done subdirectories. 509 private final Set<Path> existingDoneSubdirs = Collections 510 .synchronizedSet(new HashSet<Path>()); 511 512 /** 513 * Maintains a mapping between intermediate user directories and the last 514 * known modification time. 515 */ 516 private ConcurrentMap<String, UserLogDir> userDirModificationTimeMap = 517 new ConcurrentHashMap<String, UserLogDir>(); 518 519 private JobACLsManager aclsMgr; 520 521 @VisibleForTesting 522 Configuration conf; 523 524 private String serialNumberFormat; 525 526 private Path doneDirPrefixPath = null; // folder for completed jobs 527 private FileContext doneDirFc; // done Dir FileContext 528 529 private Path intermediateDoneDirPath = null; // Intermediate Done Dir Path 530 private FileContext intermediateDoneDirFc; // Intermediate Done Dir 531 // FileContext 532 @VisibleForTesting 533 protected ThreadPoolExecutor moveToDoneExecutor = null; 534 private long maxHistoryAge = 0; 535 536 public HistoryFileManager() { 537 super(HistoryFileManager.class.getName()); 538 } 539 540 @Override 541 protected void serviceInit(Configuration conf) throws Exception { 542 this.conf = conf; 543 544 int serialNumberLowDigits = 3; 545 serialNumberFormat = ("%0" 546 + (JobHistoryUtils.SERIAL_NUMBER_DIRECTORY_DIGITS + serialNumberLowDigits) 547 + "d"); 548 549 long maxFSWaitTime = conf.getLong( 550 JHAdminConfig.MR_HISTORY_MAX_START_WAIT_TIME, 551 JHAdminConfig.DEFAULT_MR_HISTORY_MAX_START_WAIT_TIME); 552 createHistoryDirs(new SystemClock(), 10 * 1000, maxFSWaitTime); 553 554 this.aclsMgr = new JobACLsManager(conf); 555 556 maxHistoryAge = conf.getLong(JHAdminConfig.MR_HISTORY_MAX_AGE_MS, 557 JHAdminConfig.DEFAULT_MR_HISTORY_MAX_AGE); 558 559 jobListCache = createJobListCache(); 560 561 serialNumberIndex = new SerialNumberIndex(conf.getInt( 562 JHAdminConfig.MR_HISTORY_DATESTRING_CACHE_SIZE, 563 JHAdminConfig.DEFAULT_MR_HISTORY_DATESTRING_CACHE_SIZE)); 564 565 int numMoveThreads = conf.getInt( 566 JHAdminConfig.MR_HISTORY_MOVE_THREAD_COUNT, 567 JHAdminConfig.DEFAULT_MR_HISTORY_MOVE_THREAD_COUNT); 568 ThreadFactory tf = new ThreadFactoryBuilder().setNameFormat( 569 "MoveIntermediateToDone Thread #%d").build(); 570 moveToDoneExecutor = new ThreadPoolExecutor(numMoveThreads, numMoveThreads, 571 1, TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>(), tf); 572 573 super.serviceInit(conf); 574 } 575 576 @VisibleForTesting 577 void createHistoryDirs(Clock clock, long intervalCheckMillis, 578 long timeOutMillis) throws IOException { 579 long start = clock.getTime(); 580 boolean done = false; 581 int counter = 0; 582 while (!done && 583 ((timeOutMillis == -1) || (clock.getTime() - start < timeOutMillis))) { 584 done = tryCreatingHistoryDirs(counter++ % 3 == 0); // log every 3 attempts, 30sec 585 try { 586 Thread.sleep(intervalCheckMillis); 587 } catch (InterruptedException ex) { 588 throw new YarnRuntimeException(ex); 589 } 590 } 591 if (!done) { 592 throw new YarnRuntimeException("Timed out '" + timeOutMillis+ 593 "ms' waiting for FileSystem to become available"); 594 } 595 } 596 597 /** 598 * DistributedFileSystem returns a RemoteException with a message stating 599 * SafeModeException in it. So this is only way to check it is because of 600 * being in safe mode. 601 */ 602 private boolean isBecauseSafeMode(Throwable ex) { 603 return ex.toString().contains("SafeModeException"); 604 } 605 606 /** 607 * Returns TRUE if the history dirs were created, FALSE if they could not 608 * be created because the FileSystem is not reachable or in safe mode and 609 * throws and exception otherwise. 610 */ 611 @VisibleForTesting 612 boolean tryCreatingHistoryDirs(boolean logWait) throws IOException { 613 boolean succeeded = true; 614 String doneDirPrefix = JobHistoryUtils. 615 getConfiguredHistoryServerDoneDirPrefix(conf); 616 try { 617 doneDirPrefixPath = FileContext.getFileContext(conf).makeQualified( 618 new Path(doneDirPrefix)); 619 doneDirFc = FileContext.getFileContext(doneDirPrefixPath.toUri(), conf); 620 doneDirFc.setUMask(JobHistoryUtils.HISTORY_DONE_DIR_UMASK); 621 mkdir(doneDirFc, doneDirPrefixPath, new FsPermission( 622 JobHistoryUtils.HISTORY_DONE_DIR_PERMISSION)); 623 } catch (ConnectException ex) { 624 if (logWait) { 625 LOG.info("Waiting for FileSystem at " + 626 doneDirPrefixPath.toUri().getAuthority() + "to be available"); 627 } 628 succeeded = false; 629 } catch (IOException e) { 630 if (isBecauseSafeMode(e)) { 631 succeeded = false; 632 if (logWait) { 633 LOG.info("Waiting for FileSystem at " + 634 doneDirPrefixPath.toUri().getAuthority() + 635 "to be out of safe mode"); 636 } 637 } else { 638 throw new YarnRuntimeException("Error creating done directory: [" 639 + doneDirPrefixPath + "]", e); 640 } 641 } 642 if (succeeded) { 643 String intermediateDoneDirPrefix = JobHistoryUtils. 644 getConfiguredHistoryIntermediateDoneDirPrefix(conf); 645 try { 646 intermediateDoneDirPath = FileContext.getFileContext(conf).makeQualified( 647 new Path(intermediateDoneDirPrefix)); 648 intermediateDoneDirFc = FileContext.getFileContext( 649 intermediateDoneDirPath.toUri(), conf); 650 mkdir(intermediateDoneDirFc, intermediateDoneDirPath, new FsPermission( 651 JobHistoryUtils.HISTORY_INTERMEDIATE_DONE_DIR_PERMISSIONS.toShort())); 652 } catch (ConnectException ex) { 653 succeeded = false; 654 if (logWait) { 655 LOG.info("Waiting for FileSystem at " + 656 intermediateDoneDirPath.toUri().getAuthority() + 657 "to be available"); 658 } 659 } catch (IOException e) { 660 if (isBecauseSafeMode(e)) { 661 succeeded = false; 662 if (logWait) { 663 LOG.info("Waiting for FileSystem at " + 664 intermediateDoneDirPath.toUri().getAuthority() + 665 "to be out of safe mode"); 666 } 667 } else { 668 throw new YarnRuntimeException( 669 "Error creating intermediate done directory: [" 670 + intermediateDoneDirPath + "]", e); 671 } 672 } 673 } 674 return succeeded; 675 } 676 677 @Override 678 public void serviceStop() throws Exception { 679 ShutdownThreadsHelper.shutdownExecutorService(moveToDoneExecutor); 680 super.serviceStop(); 681 } 682 683 protected JobListCache createJobListCache() { 684 return new JobListCache(conf.getInt( 685 JHAdminConfig.MR_HISTORY_JOBLIST_CACHE_SIZE, 686 JHAdminConfig.DEFAULT_MR_HISTORY_JOBLIST_CACHE_SIZE), maxHistoryAge); 687 } 688 689 private void mkdir(FileContext fc, Path path, FsPermission fsp) 690 throws IOException { 691 if (!fc.util().exists(path)) { 692 try { 693 fc.mkdir(path, fsp, true); 694 695 FileStatus fsStatus = fc.getFileStatus(path); 696 LOG.info("Perms after creating " + fsStatus.getPermission().toShort() 697 + ", Expected: " + fsp.toShort()); 698 if (fsStatus.getPermission().toShort() != fsp.toShort()) { 699 LOG.info("Explicitly setting permissions to : " + fsp.toShort() 700 + ", " + fsp); 701 fc.setPermission(path, fsp); 702 } 703 } catch (FileAlreadyExistsException e) { 704 LOG.info("Directory: [" + path + "] already exists."); 705 } 706 } 707 } 708 709 /** 710 * Populates index data structures. Should only be called at initialization 711 * times. 712 */ 713 @SuppressWarnings("unchecked") 714 void initExisting() throws IOException { 715 LOG.info("Initializing Existing Jobs..."); 716 List<FileStatus> timestampedDirList = findTimestampedDirectories(); 717 // Sort first just so insertion is in a consistent order 718 Collections.sort(timestampedDirList); 719 for (FileStatus fs : timestampedDirList) { 720 // TODO Could verify the correct format for these directories. 721 addDirectoryToSerialNumberIndex(fs.getPath()); 722 } 723 for (int i= timestampedDirList.size() - 1; 724 i >= 0 && !jobListCache.isFull(); i--) { 725 FileStatus fs = timestampedDirList.get(i); 726 addDirectoryToJobListCache(fs.getPath()); 727 } 728 } 729 730 private void removeDirectoryFromSerialNumberIndex(Path serialDirPath) { 731 String serialPart = serialDirPath.getName(); 732 String timeStampPart = JobHistoryUtils 733 .getTimestampPartFromPath(serialDirPath.toString()); 734 if (timeStampPart == null) { 735 LOG.warn("Could not find timestamp portion from path: " 736 + serialDirPath.toString() + ". Continuing with next"); 737 return; 738 } 739 if (serialPart == null) { 740 LOG.warn("Could not find serial portion from path: " 741 + serialDirPath.toString() + ". Continuing with next"); 742 return; 743 } 744 serialNumberIndex.remove(serialPart, timeStampPart); 745 } 746 747 private void addDirectoryToSerialNumberIndex(Path serialDirPath) { 748 if (LOG.isDebugEnabled()) { 749 LOG.debug("Adding " + serialDirPath + " to serial index"); 750 } 751 String serialPart = serialDirPath.getName(); 752 String timestampPart = JobHistoryUtils 753 .getTimestampPartFromPath(serialDirPath.toString()); 754 if (timestampPart == null) { 755 LOG.warn("Could not find timestamp portion from path: " + serialDirPath 756 + ". Continuing with next"); 757 return; 758 } 759 if (serialPart == null) { 760 LOG.warn("Could not find serial portion from path: " 761 + serialDirPath.toString() + ". Continuing with next"); 762 } else { 763 serialNumberIndex.add(serialPart, timestampPart); 764 } 765 } 766 767 private void addDirectoryToJobListCache(Path path) throws IOException { 768 if (LOG.isDebugEnabled()) { 769 LOG.debug("Adding " + path + " to job list cache."); 770 } 771 List<FileStatus> historyFileList = scanDirectoryForHistoryFiles(path, 772 doneDirFc); 773 for (FileStatus fs : historyFileList) { 774 if (LOG.isDebugEnabled()) { 775 LOG.debug("Adding in history for " + fs.getPath()); 776 } 777 JobIndexInfo jobIndexInfo = FileNameIndexUtils.getIndexInfo(fs.getPath() 778 .getName()); 779 String confFileName = JobHistoryUtils 780 .getIntermediateConfFileName(jobIndexInfo.getJobId()); 781 String summaryFileName = JobHistoryUtils 782 .getIntermediateSummaryFileName(jobIndexInfo.getJobId()); 783 HistoryFileInfo fileInfo = new HistoryFileInfo(fs.getPath(), new Path(fs 784 .getPath().getParent(), confFileName), new Path(fs.getPath() 785 .getParent(), summaryFileName), jobIndexInfo, true); 786 jobListCache.addIfAbsent(fileInfo); 787 } 788 } 789 790 @VisibleForTesting 791 protected static List<FileStatus> scanDirectory(Path path, FileContext fc, 792 PathFilter pathFilter) throws IOException { 793 path = fc.makeQualified(path); 794 List<FileStatus> jhStatusList = new ArrayList<FileStatus>(); 795 try { 796 RemoteIterator<FileStatus> fileStatusIter = fc.listStatus(path); 797 while (fileStatusIter.hasNext()) { 798 FileStatus fileStatus = fileStatusIter.next(); 799 Path filePath = fileStatus.getPath(); 800 if (fileStatus.isFile() && pathFilter.accept(filePath)) { 801 jhStatusList.add(fileStatus); 802 } 803 } 804 } catch (FileNotFoundException fe) { 805 LOG.error("Error while scanning directory " + path, fe); 806 } 807 return jhStatusList; 808 } 809 810 protected List<FileStatus> scanDirectoryForHistoryFiles(Path path, 811 FileContext fc) throws IOException { 812 return scanDirectory(path, fc, JobHistoryUtils.getHistoryFileFilter()); 813 } 814 815 /** 816 * Finds all history directories with a timestamp component by scanning the 817 * filesystem. Used when the JobHistory server is started. 818 * 819 * @return list of history directories 820 */ 821 protected List<FileStatus> findTimestampedDirectories() throws IOException { 822 List<FileStatus> fsList = JobHistoryUtils.localGlobber(doneDirFc, 823 doneDirPrefixPath, DONE_BEFORE_SERIAL_TAIL); 824 return fsList; 825 } 826 827 /** 828 * Scans the intermediate directory to find user directories. Scans these for 829 * history files if the modification time for the directory has changed. Once 830 * it finds history files it starts the process of moving them to the done 831 * directory. 832 * 833 * @throws IOException 834 * if there was a error while scanning 835 */ 836 void scanIntermediateDirectory() throws IOException { 837 // TODO it would be great to limit how often this happens, except in the 838 // case where we are looking for a particular job. 839 List<FileStatus> userDirList = JobHistoryUtils.localGlobber( 840 intermediateDoneDirFc, intermediateDoneDirPath, ""); 841 LOG.debug("Scanning intermediate dirs"); 842 for (FileStatus userDir : userDirList) { 843 String name = userDir.getPath().getName(); 844 UserLogDir dir = userDirModificationTimeMap.get(name); 845 if(dir == null) { 846 dir = new UserLogDir(); 847 UserLogDir old = userDirModificationTimeMap.putIfAbsent(name, dir); 848 if(old != null) { 849 dir = old; 850 } 851 } 852 dir.scanIfNeeded(userDir); 853 } 854 } 855 856 /** 857 * Scans the specified path and populates the intermediate cache. 858 * 859 * @param absPath 860 * @throws IOException 861 */ 862 private void scanIntermediateDirectory(final Path absPath) throws IOException { 863 if (LOG.isDebugEnabled()) { 864 LOG.debug("Scanning intermediate dir " + absPath); 865 } 866 List<FileStatus> fileStatusList = scanDirectoryForHistoryFiles(absPath, 867 intermediateDoneDirFc); 868 if (LOG.isDebugEnabled()) { 869 LOG.debug("Found " + fileStatusList.size() + " files"); 870 } 871 for (FileStatus fs : fileStatusList) { 872 if (LOG.isDebugEnabled()) { 873 LOG.debug("scanning file: "+ fs.getPath()); 874 } 875 JobIndexInfo jobIndexInfo = FileNameIndexUtils.getIndexInfo(fs.getPath() 876 .getName()); 877 String confFileName = JobHistoryUtils 878 .getIntermediateConfFileName(jobIndexInfo.getJobId()); 879 String summaryFileName = JobHistoryUtils 880 .getIntermediateSummaryFileName(jobIndexInfo.getJobId()); 881 HistoryFileInfo fileInfo = new HistoryFileInfo(fs.getPath(), new Path(fs 882 .getPath().getParent(), confFileName), new Path(fs.getPath() 883 .getParent(), summaryFileName), jobIndexInfo, false); 884 885 final HistoryFileInfo old = jobListCache.addIfAbsent(fileInfo); 886 if (old == null || old.didMoveFail()) { 887 final HistoryFileInfo found = (old == null) ? fileInfo : old; 888 long cutoff = System.currentTimeMillis() - maxHistoryAge; 889 if(found.getJobIndexInfo().getFinishTime() <= cutoff) { 890 try { 891 found.delete(); 892 } catch (IOException e) { 893 LOG.warn("Error cleaning up a HistoryFile that is out of date.", e); 894 } 895 } else { 896 if (LOG.isDebugEnabled()) { 897 LOG.debug("Scheduling move to done of " +found); 898 } 899 moveToDoneExecutor.execute(new Runnable() { 900 @Override 901 public void run() { 902 try { 903 found.moveToDone(); 904 } catch (IOException e) { 905 LOG.info("Failed to process fileInfo for job: " + 906 found.getJobId(), e); 907 } 908 } 909 }); 910 } 911 } else if (!old.isMovePending()) { 912 //This is a duplicate so just delete it 913 if (LOG.isDebugEnabled()) { 914 LOG.debug("Duplicate: deleting"); 915 } 916 fileInfo.delete(); 917 } 918 } 919 } 920 921 /** 922 * Searches the job history file FileStatus list for the specified JobId. 923 * 924 * @param fileStatusList 925 * fileStatus list of Job History Files. 926 * @param jobId 927 * The JobId to find. 928 * @return A FileInfo object for the jobId, null if not found. 929 * @throws IOException 930 */ 931 private HistoryFileInfo getJobFileInfo(List<FileStatus> fileStatusList, 932 JobId jobId) throws IOException { 933 for (FileStatus fs : fileStatusList) { 934 JobIndexInfo jobIndexInfo = FileNameIndexUtils.getIndexInfo(fs.getPath() 935 .getName()); 936 if (jobIndexInfo.getJobId().equals(jobId)) { 937 String confFileName = JobHistoryUtils 938 .getIntermediateConfFileName(jobIndexInfo.getJobId()); 939 String summaryFileName = JobHistoryUtils 940 .getIntermediateSummaryFileName(jobIndexInfo.getJobId()); 941 HistoryFileInfo fileInfo = new HistoryFileInfo(fs.getPath(), new Path( 942 fs.getPath().getParent(), confFileName), new Path(fs.getPath() 943 .getParent(), summaryFileName), jobIndexInfo, true); 944 return fileInfo; 945 } 946 } 947 return null; 948 } 949 950 /** 951 * Scans old directories known by the idToDateString map for the specified 952 * jobId. If the number of directories is higher than the supported size of 953 * the idToDateString cache, the jobId will not be found. 954 * 955 * @param jobId 956 * the jobId. 957 * @return 958 * @throws IOException 959 */ 960 private HistoryFileInfo scanOldDirsForJob(JobId jobId) throws IOException { 961 String boxedSerialNumber = JobHistoryUtils.serialNumberDirectoryComponent( 962 jobId, serialNumberFormat); 963 Set<String> dateStringSet = serialNumberIndex.get(boxedSerialNumber); 964 if (dateStringSet == null) { 965 return null; 966 } 967 for (String timestampPart : dateStringSet) { 968 Path logDir = canonicalHistoryLogPath(jobId, timestampPart); 969 List<FileStatus> fileStatusList = scanDirectoryForHistoryFiles(logDir, 970 doneDirFc); 971 HistoryFileInfo fileInfo = getJobFileInfo(fileStatusList, jobId); 972 if (fileInfo != null) { 973 return fileInfo; 974 } 975 } 976 return null; 977 } 978 979 public Collection<HistoryFileInfo> getAllFileInfo() throws IOException { 980 scanIntermediateDirectory(); 981 return jobListCache.values(); 982 } 983 984 public HistoryFileInfo getFileInfo(JobId jobId) throws IOException { 985 // FileInfo available in cache. 986 HistoryFileInfo fileInfo = jobListCache.get(jobId); 987 if (fileInfo != null) { 988 return fileInfo; 989 } 990 // OK so scan the intermediate to be sure we did not lose it that way 991 scanIntermediateDirectory(); 992 fileInfo = jobListCache.get(jobId); 993 if (fileInfo != null) { 994 return fileInfo; 995 } 996 997 // Intermediate directory does not contain job. Search through older ones. 998 fileInfo = scanOldDirsForJob(jobId); 999 if (fileInfo != null) { 1000 return fileInfo; 1001 } 1002 return null; 1003 } 1004 1005 private void moveToDoneNow(final Path src, final Path target) 1006 throws IOException { 1007 LOG.info("Moving " + src.toString() + " to " + target.toString()); 1008 intermediateDoneDirFc.rename(src, target, Options.Rename.NONE); 1009 } 1010 1011 private String getJobSummary(FileContext fc, Path path) throws IOException { 1012 Path qPath = fc.makeQualified(path); 1013 FSDataInputStream in = null; 1014 String jobSummaryString = null; 1015 try { 1016 in = fc.open(qPath); 1017 jobSummaryString = in.readUTF(); 1018 } finally { 1019 if (in != null) { 1020 in.close(); 1021 } 1022 } 1023 return jobSummaryString; 1024 } 1025 1026 private void makeDoneSubdir(Path path) throws IOException { 1027 try { 1028 doneDirFc.getFileStatus(path); 1029 existingDoneSubdirs.add(path); 1030 } catch (FileNotFoundException fnfE) { 1031 try { 1032 FsPermission fsp = new FsPermission( 1033 JobHistoryUtils.HISTORY_DONE_DIR_PERMISSION); 1034 doneDirFc.mkdir(path, fsp, true); 1035 FileStatus fsStatus = doneDirFc.getFileStatus(path); 1036 LOG.info("Perms after creating " + fsStatus.getPermission().toShort() 1037 + ", Expected: " + fsp.toShort()); 1038 if (fsStatus.getPermission().toShort() != fsp.toShort()) { 1039 LOG.info("Explicitly setting permissions to : " + fsp.toShort() 1040 + ", " + fsp); 1041 doneDirFc.setPermission(path, fsp); 1042 } 1043 existingDoneSubdirs.add(path); 1044 } catch (FileAlreadyExistsException faeE) { // Nothing to do. 1045 } 1046 } 1047 } 1048 1049 private Path canonicalHistoryLogPath(JobId id, String timestampComponent) { 1050 return new Path(doneDirPrefixPath, JobHistoryUtils.historyLogSubdirectory( 1051 id, timestampComponent, serialNumberFormat)); 1052 } 1053 1054 private Path canonicalHistoryLogPath(JobId id, long millisecondTime) { 1055 String timestampComponent = JobHistoryUtils 1056 .timestampDirectoryComponent(millisecondTime); 1057 return new Path(doneDirPrefixPath, JobHistoryUtils.historyLogSubdirectory( 1058 id, timestampComponent, serialNumberFormat)); 1059 } 1060 1061 private long getEffectiveTimestamp(long finishTime, FileStatus fileStatus) { 1062 if (finishTime == 0) { 1063 return fileStatus.getModificationTime(); 1064 } 1065 return finishTime; 1066 } 1067 1068 private void deleteJobFromDone(HistoryFileInfo fileInfo) throws IOException { 1069 jobListCache.delete(fileInfo); 1070 fileInfo.delete(); 1071 } 1072 1073 List<FileStatus> getHistoryDirsForCleaning(long cutoff) throws IOException { 1074 return JobHistoryUtils. 1075 getHistoryDirsForCleaning(doneDirFc, doneDirPrefixPath, cutoff); 1076 } 1077 1078 /** 1079 * Clean up older history files. 1080 * 1081 * @throws IOException 1082 * on any error trying to remove the entries. 1083 */ 1084 @SuppressWarnings("unchecked") 1085 void clean() throws IOException { 1086 long cutoff = System.currentTimeMillis() - maxHistoryAge; 1087 boolean halted = false; 1088 List<FileStatus> serialDirList = getHistoryDirsForCleaning(cutoff); 1089 // Sort in ascending order. Relies on YYYY/MM/DD/Serial 1090 Collections.sort(serialDirList); 1091 for (FileStatus serialDir : serialDirList) { 1092 List<FileStatus> historyFileList = scanDirectoryForHistoryFiles( 1093 serialDir.getPath(), doneDirFc); 1094 for (FileStatus historyFile : historyFileList) { 1095 JobIndexInfo jobIndexInfo = FileNameIndexUtils.getIndexInfo(historyFile 1096 .getPath().getName()); 1097 long effectiveTimestamp = getEffectiveTimestamp( 1098 jobIndexInfo.getFinishTime(), historyFile); 1099 if (effectiveTimestamp <= cutoff) { 1100 HistoryFileInfo fileInfo = this.jobListCache.get(jobIndexInfo 1101 .getJobId()); 1102 if (fileInfo == null) { 1103 String confFileName = JobHistoryUtils 1104 .getIntermediateConfFileName(jobIndexInfo.getJobId()); 1105 1106 fileInfo = new HistoryFileInfo(historyFile.getPath(), new Path( 1107 historyFile.getPath().getParent(), confFileName), null, 1108 jobIndexInfo, true); 1109 } 1110 deleteJobFromDone(fileInfo); 1111 } else { 1112 halted = true; 1113 break; 1114 } 1115 } 1116 if (!halted) { 1117 deleteDir(serialDir); 1118 removeDirectoryFromSerialNumberIndex(serialDir.getPath()); 1119 existingDoneSubdirs.remove(serialDir.getPath()); 1120 } else { 1121 break; // Don't scan any more directories. 1122 } 1123 } 1124 } 1125 1126 protected boolean deleteDir(FileStatus serialDir) 1127 throws AccessControlException, FileNotFoundException, 1128 UnsupportedFileSystemException, IOException { 1129 return doneDirFc.delete(doneDirFc.makeQualified(serialDir.getPath()), true); 1130 } 1131 1132 @VisibleForTesting 1133 protected void setMaxHistoryAge(long newValue){ 1134 maxHistoryAge=newValue; 1135 } 1136}