001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.mapreduce;
020
021import java.io.IOException;
022
023import org.apache.hadoop.classification.InterfaceAudience;
024import org.apache.hadoop.classification.InterfaceStability;
025/**
026 * <code>OutputCommitter</code> describes the commit of task output for a 
027 * Map-Reduce job.
028 *
029 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of 
030 * the job to:</p>
031 * <ol>
032 *   <li>
033 *   Set up the job during initialization. For example, create the temporary 
034 *   output directory for the job during the initialization of the job.
035 *   </li>
036 *   <li>
037 *   Clean up the job after the job completion. For example, remove the
038 *   temporary output directory after the job completion. 
039 *   </li>
040 *   <li>
041 *   Set up the task temporary output.
042 *   </li> 
043 *   <li>
044 *   Check whether a task needs a commit. This is to avoid the commit
045 *   procedure if a task does not need commit.
046 *   </li>
047 *   <li>
048 *   Commit of the task output.
049 *   </li>  
050 *   <li>
051 *   Discard the task commit.
052 *   </li>
053 * </ol>
054 * <p>The methods in this class can be called from several different processes
055 * and from several different contexts.  It is important to know which process
056 * and which context each is called from.  Each method should be marked
057 * accordingly in its documentation.  It is also important to note that not all
058 * methods are guaranteed to be called once and only once.  If a method is not
059 * guaranteed to have this property the output committer needs to handle this
060 * appropriately.  Also note that methods may be called multiple times for the
061 * same task only in rare situations.</p>
062 * 
063 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 
064 * @see JobContext
065 * @see TaskAttemptContext 
066 */
067@InterfaceAudience.Public
068@InterfaceStability.Stable
069public abstract class OutputCommitter {
070  /**
071   * For the framework to set up the job output during initialization.  This is
072   * called from the application master process for the entire job. This will be
073   * called multiple times, once per job attempt.
074   * 
075   * @param jobContext Context of the job whose output is being written.
076   * @throws IOException if temporary output could not be created
077   */
078  public abstract void setupJob(JobContext jobContext) throws IOException;
079
080  /**
081   * For cleaning up the job's output after job completion.  This is called
082   * from the application master process for the entire job. This may be called
083   * multiple times.  The implementation in this class does nothing.
084   * 
085   * @param jobContext Context of the job whose output is being written.
086   * @throws IOException if an implementation fails to clean up the job output
087   * @deprecated Use {@link #commitJob(JobContext)} and
088   *                 {@link #abortJob(JobContext, JobStatus.State)} instead.
089   */
090  @Deprecated
091  public void cleanupJob(JobContext jobContext) throws IOException { }
092
093  /**
094   * For committing job's output after successful job completion. Note that this
095   * is invoked for jobs with final runstate as SUCCESSFUL.  This is called
096   * from the application master process for the entire job. This is guaranteed
097   * to only be called once.  If it throws an exception the entire job will
098   * fail.  The implementation in this class only calls {@link #cleanupJob}.
099   * 
100   * @param jobContext Context of the job whose output is being written.
101   * @throws IOException if an implementation fails to commit the job output
102   */
103  public void commitJob(JobContext jobContext) throws IOException {
104    cleanupJob(jobContext);
105  }
106
107  
108  /**
109   * For aborting an unsuccessful job's output. Note that this is invoked for 
110   * jobs with final runstate as {@link JobStatus.State#FAILED} or 
111   * {@link JobStatus.State#KILLED}.  This is called from the application
112   * master process for the entire job. This may be called multiple times.
113   * The implementation in this class ignores <code>state</code> and only calls
114   * {@link #cleanupJob}.
115   * @param jobContext Context of the job whose output is being written.
116   * @param state final runstate of the job
117   * @throws IOException if an implementation fails to abort the job output
118   */
119  public void abortJob(JobContext jobContext, JobStatus.State state) 
120  throws IOException {
121    cleanupJob(jobContext);
122  }
123  
124  /**
125   * Sets up output for the task.  This is called from each individual task's
126   * process that will output to HDFS, and it is called just for that task. This
127   * may be called multiple times for the same task, but for different task
128   * attempts.
129   * 
130   * @param taskContext Context of the task whose output is being written.
131   * @throws IOException if an implementation fails to set up the task output
132   */
133  public abstract void setupTask(TaskAttemptContext taskContext)
134  throws IOException;
135  
136  /**
137   * Check whether task needs a commit.  This is called from each individual
138   * task's process that will output to HDFS, and it is called just for that
139   * task.
140   * 
141   * @param taskContext Context of the task whose output is being written.
142   * @return <code>true</code> if the task needs a commit, <code>false</code> otherwise
143   * @throws IOException if an implementation fails to perform the check
144   */
145  public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
146  throws IOException;
147
148  /**
149   * To promote the task's temporary output to final output location.
150   * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
151   * task is the task that the AM determines finished first, this method
152   * is called to commit an individual task's output.  This is to mark
153   * that task's output as complete, as {@link #commitJob(JobContext)} will 
154   * also be called later on if the entire job finished successfully. This
155   * is called from a task's process. This may be called multiple times for the
156   * same task, but different task attempts.  It should be very rare for this to
157   * be called multiple times and requires odd networking failures to make this
158   * happen. In the future the Hadoop framework may eliminate this race.
159   * 
160   * @param taskContext Context of the task whose output is being written.
161   * @throws IOException if commit is not successful. 
162   */
163  public abstract void commitTask(TaskAttemptContext taskContext)
164  throws IOException;
165  
166  /**
167   * Discard the task output. This is called from a task's process to clean 
168   * up a single task's output that has not yet been committed. This may be
169   * called multiple times for the same task, but for different task attempts.
170   * 
171   * @param taskContext Context of the task whose output is being discarded.
172   * @throws IOException if an implementation fails to discard the task output
173   */
174  public abstract void abortTask(TaskAttemptContext taskContext)
175  throws IOException;
176
177  /**
178   * Is task output recovery supported for restarting jobs?
179   * 
180   * If task output recovery is supported, job restart can be done more
181   * efficiently.  The implementation in this class returns <code>false</code>.
182   * 
183   * @return <code>true</code> if task output recovery is supported,
184   *         <code>false</code> otherwise
185   * @see #recoverTask(TaskAttemptContext)
186   * @deprecated Use {@link #isRecoverySupported(JobContext)} instead.
187   */
188  @Deprecated
189  public boolean isRecoverySupported() {
190    return false;
191  }
192
193  /**
194   * Returns true if an in-progress job commit can be retried. If the MR AM is
195   * re-run then it will check this value to determine if it can retry an
196   * in-progress commit that was started by a previous version.
197   * Note that in rare scenarios, the previous AM version might still be running
198   * at that time, due to system anomalies. Hence if this method returns true
199   * then the retry commit operation should be able to run concurrently with
200   * the previous operation.
201   *
202   * If repeatable job commit is supported, job restart can tolerate previous
203   * AM failures during job commit.
204   *
205   * By default, it is not supported. Extended classes (like:
206   * FileOutputCommitter) should explicitly override it if they provide support.
207   *
208   * @param jobContext
209   *          Context of the job whose output is being written.
210   * @return <code>true</code> if repeatable job commit is supported,
211   *         <code>false</code> otherwise
212   * @throws IOException if an implementation fails to determine the answer
213   */
214  public boolean isCommitJobRepeatable(JobContext jobContext)
215      throws IOException {
216    return false;
217  }
218
219  /**
220   * Is task output recovery supported for restarting jobs?
221   * 
222   * If task output recovery is supported, job restart can be done more
223   * efficiently.  By default this returns {@link #isRecoverySupported()}.
224   * 
225   * @param jobContext
226   *          Context of the job whose output is being written.
227   * @return <code>true</code> if task output recovery is supported,
228   *         <code>false</code> otherwise
229   * @throws IOException if an implementation fails to determine the answer
230   * @see #recoverTask(TaskAttemptContext)
231   */
232  public boolean isRecoverySupported(JobContext jobContext) throws IOException {
233    return isRecoverySupported();
234  }
235
236  /**
237   * Recover the task output. 
238   * 
239   * The retry-count for the job will be passed via the 
240   * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in  
241   * {@link TaskAttemptContext#getConfiguration()} for the 
242   * <code>OutputCommitter</code>.  This is called from the application master
243   * process, but it is called individually for each task.
244   * 
245   * If an exception is thrown the task will be attempted again. 
246   * 
247   * This may be called multiple times for the same task, but from different
248   * application attempts.  The implementation in this class does nothing.
249   * 
250   * @param taskContext Context of the task whose output is being recovered
251   * @throws IOException if an implementation fails to recover the task output
252   */
253  public void recoverTask(TaskAttemptContext taskContext)
254  throws IOException
255  {}
256}