001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.mapreduce; 020 021import java.io.IOException; 022 023import org.apache.hadoop.classification.InterfaceAudience; 024import org.apache.hadoop.classification.InterfaceStability; 025/** 026 * <code>OutputCommitter</code> describes the commit of task output for a 027 * Map-Reduce job. 028 * 029 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of 030 * the job to:<p> 031 * <ol> 032 * <li> 033 * Setup the job during initialization. For example, create the temporary 034 * output directory for the job during the initialization of the job. 035 * </li> 036 * <li> 037 * Cleanup the job after the job completion. For example, remove the 038 * temporary output directory after the job completion. 039 * </li> 040 * <li> 041 * Setup the task temporary output. 042 * </li> 043 * <li> 044 * Check whether a task needs a commit. This is to avoid the commit 045 * procedure if a task does not need commit. 046 * </li> 047 * <li> 048 * Commit of the task output. 049 * </li> 050 * <li> 051 * Discard the task commit. 052 * </li> 053 * </ol> 054 * The methods in this class can be called from several different processes and 055 * from several different contexts. It is important to know which process and 056 * which context each is called from. Each method should be marked accordingly 057 * in its documentation. It is also important to note that not all methods are 058 * guaranteed to be called once and only once. If a method is not guaranteed to 059 * have this property the output committer needs to handle this appropriately. 060 * Also note it will only be in rare situations where they may be called 061 * multiple times for the same task. 062 * 063 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 064 * @see JobContext 065 * @see TaskAttemptContext 066 */ 067@InterfaceAudience.Public 068@InterfaceStability.Stable 069public abstract class OutputCommitter { 070 /** 071 * For the framework to setup the job output during initialization. This is 072 * called from the application master process for the entire job. This will be 073 * called multiple times, once per job attempt. 074 * 075 * @param jobContext Context of the job whose output is being written. 076 * @throws IOException if temporary output could not be created 077 */ 078 public abstract void setupJob(JobContext jobContext) throws IOException; 079 080 /** 081 * For cleaning up the job's output after job completion. This is called 082 * from the application master process for the entire job. This may be called 083 * multiple times. 084 * 085 * @param jobContext Context of the job whose output is being written. 086 * @throws IOException 087 * @deprecated Use {@link #commitJob(JobContext)} and 088 * {@link #abortJob(JobContext, JobStatus.State)} instead. 089 */ 090 @Deprecated 091 public void cleanupJob(JobContext jobContext) throws IOException { } 092 093 /** 094 * For committing job's output after successful job completion. Note that this 095 * is invoked for jobs with final runstate as SUCCESSFUL. This is called 096 * from the application master process for the entire job. This is guaranteed 097 * to only be called once. If it throws an exception the entire job will 098 * fail. 099 * 100 * @param jobContext Context of the job whose output is being written. 101 * @throws IOException 102 */ 103 public void commitJob(JobContext jobContext) throws IOException { 104 cleanupJob(jobContext); 105 } 106 107 108 /** 109 * For aborting an unsuccessful job's output. Note that this is invoked for 110 * jobs with final runstate as {@link JobStatus.State#FAILED} or 111 * {@link JobStatus.State#KILLED}. This is called from the application 112 * master process for the entire job. This may be called multiple times. 113 * 114 * @param jobContext Context of the job whose output is being written. 115 * @param state final runstate of the job 116 * @throws IOException 117 */ 118 public void abortJob(JobContext jobContext, JobStatus.State state) 119 throws IOException { 120 cleanupJob(jobContext); 121 } 122 123 /** 124 * Sets up output for the task. This is called from each individual task's 125 * process that will output to HDFS, and it is called just for that task. This 126 * may be called multiple times for the same task, but for different task 127 * attempts. 128 * 129 * @param taskContext Context of the task whose output is being written. 130 * @throws IOException 131 */ 132 public abstract void setupTask(TaskAttemptContext taskContext) 133 throws IOException; 134 135 /** 136 * Check whether task needs a commit. This is called from each individual 137 * task's process that will output to HDFS, and it is called just for that 138 * task. 139 * 140 * @param taskContext 141 * @return true/false 142 * @throws IOException 143 */ 144 public abstract boolean needsTaskCommit(TaskAttemptContext taskContext) 145 throws IOException; 146 147 /** 148 * To promote the task's temporary output to final output location. 149 * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this 150 * task is the task that the AM determines finished first, this method 151 * is called to commit an individual task's output. This is to mark 152 * that tasks output as complete, as {@link #commitJob(JobContext)} will 153 * also be called later on if the entire job finished successfully. This 154 * is called from a task's process. This may be called multiple times for the 155 * same task, but different task attempts. It should be very rare for this to 156 * be called multiple times and requires odd networking failures to make this 157 * happen. In the future the Hadoop framework may eliminate this race. 158 * 159 * @param taskContext Context of the task whose output is being written. 160 * @throws IOException if commit is not successful. 161 */ 162 public abstract void commitTask(TaskAttemptContext taskContext) 163 throws IOException; 164 165 /** 166 * Discard the task output. This is called from a task's process to clean 167 * up a single task's output that can not yet been committed. This may be 168 * called multiple times for the same task, but for different task attempts. 169 * 170 * @param taskContext 171 * @throws IOException 172 */ 173 public abstract void abortTask(TaskAttemptContext taskContext) 174 throws IOException; 175 176 /** 177 * Is task output recovery supported for restarting jobs? 178 * 179 * If task output recovery is supported, job restart can be done more 180 * efficiently. 181 * 182 * @return <code>true</code> if task output recovery is supported, 183 * <code>false</code> otherwise 184 * @see #recoverTask(TaskAttemptContext) 185 * @deprecated Use {@link #isRecoverySupported(JobContext)} instead. 186 */ 187 @Deprecated 188 public boolean isRecoverySupported() { 189 return false; 190 } 191 192 /** 193 * Is task output recovery supported for restarting jobs? 194 * 195 * If task output recovery is supported, job restart can be done more 196 * efficiently. 197 * 198 * @param jobContext 199 * Context of the job whose output is being written. 200 * @return <code>true</code> if task output recovery is supported, 201 * <code>false</code> otherwise 202 * @throws IOException 203 * @see #recoverTask(TaskAttemptContext) 204 */ 205 public boolean isRecoverySupported(JobContext jobContext) throws IOException { 206 return isRecoverySupported(); 207 } 208 209 /** 210 * Recover the task output. 211 * 212 * The retry-count for the job will be passed via the 213 * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in 214 * {@link TaskAttemptContext#getConfiguration()} for the 215 * <code>OutputCommitter</code>. This is called from the application master 216 * process, but it is called individually for each task. 217 * 218 * If an exception is thrown the task will be attempted again. 219 * 220 * This may be called multiple times for the same task. But from different 221 * application attempts. 222 * 223 * @param taskContext Context of the task whose output is being recovered 224 * @throws IOException 225 */ 226 public void recoverTask(TaskAttemptContext taskContext) 227 throws IOException 228 {} 229}