Source code

001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.mapreduce;
020
021import java.io.IOException;
022
023import org.apache.hadoop.classification.InterfaceAudience;
024import org.apache.hadoop.classification.InterfaceStability;
025/**
026 * <code>OutputCommitter</code> describes the commit of task output for a 
027 * Map-Reduce job.
028 *
029 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of 
030 * the job to:<p>
031 * <ol>
032 *   <li>
033 *   Setup the job during initialization. For example, create the temporary 
034 *   output directory for the job during the initialization of the job.
035 *   </li>
036 *   <li>
037 *   Cleanup the job after the job completion. For example, remove the
038 *   temporary output directory after the job completion. 
039 *   </li>
040 *   <li>
041 *   Setup the task temporary output.
042 *   </li> 
043 *   <li>
044 *   Check whether a task needs a commit. This is to avoid the commit
045 *   procedure if a task does not need commit.
046 *   </li>
047 *   <li>
048 *   Commit of the task output.
049 *   </li>  
050 *   <li>
051 *   Discard the task commit.
052 *   </li>
053 * </ol>
054 * The methods in this class can be called from several different processes and
055 * from several different contexts.  It is important to know which process and
056 * which context each is called from.  Each method should be marked accordingly
057 * in its documentation.  It is also important to note that not all methods are
058 * guaranteed to be called once and only once.  If a method is not guaranteed to
059 * have this property the output committer needs to handle this appropriately. 
060 * Also note it will only be in rare situations where they may be called 
061 * multiple times for the same task.
062 * 
063 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 
064 * @see JobContext
065 * @see TaskAttemptContext 
066 */
067@InterfaceAudience.Public
068@InterfaceStability.Stable
069public abstract class OutputCommitter {
070  /**
071   * For the framework to setup the job output during initialization.  This is
072   * called from the application master process for the entire job. This will be
073   * called multiple times, once per job attempt.
074   * 
075   * @param jobContext Context of the job whose output is being written.
076   * @throws IOException if temporary output could not be created
077   */
078  public abstract void setupJob(JobContext jobContext) throws IOException;
079
080  /**
081   * For cleaning up the job's output after job completion.  This is called
082   * from the application master process for the entire job. This may be called
083   * multiple times.
084   * 
085   * @param jobContext Context of the job whose output is being written.
086   * @throws IOException
087   * @deprecated Use {@link #commitJob(JobContext)} and
088   *                 {@link #abortJob(JobContext, JobStatus.State)} instead.
089   */
090  @Deprecated
091  public void cleanupJob(JobContext jobContext) throws IOException { }
092
093  /**
094   * For committing job's output after successful job completion. Note that this
095   * is invoked for jobs with final runstate as SUCCESSFUL.  This is called
096   * from the application master process for the entire job. This is guaranteed
097   * to only be called once.  If it throws an exception the entire job will
098   * fail.      
099   * 
100   * @param jobContext Context of the job whose output is being written.
101   * @throws IOException
102   */
103  public void commitJob(JobContext jobContext) throws IOException {
104    cleanupJob(jobContext);
105  }
106
107  
108  /**
109   * For aborting an unsuccessful job's output. Note that this is invoked for 
110   * jobs with final runstate as {@link JobStatus.State#FAILED} or 
111   * {@link JobStatus.State#KILLED}.  This is called from the application
112   * master process for the entire job. This may be called multiple times.
113   *
114   * @param jobContext Context of the job whose output is being written.
115   * @param state final runstate of the job
116   * @throws IOException
117   */
118  public void abortJob(JobContext jobContext, JobStatus.State state) 
119  throws IOException {
120    cleanupJob(jobContext);
121  }
122  
123  /**
124   * Sets up output for the task.  This is called from each individual task's
125   * process that will output to HDFS, and it is called just for that task. This
126   * may be called multiple times for the same task, but for different task
127   * attempts.
128   * 
129   * @param taskContext Context of the task whose output is being written.
130   * @throws IOException
131   */
132  public abstract void setupTask(TaskAttemptContext taskContext)
133  throws IOException;
134  
135  /**
136   * Check whether task needs a commit.  This is called from each individual
137   * task's process that will output to HDFS, and it is called just for that
138   * task.
139   * 
140   * @param taskContext
141   * @return true/false
142   * @throws IOException
143   */
144  public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
145  throws IOException;
146
147  /**
148   * To promote the task's temporary output to final output location.
149   * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
150   * task is the task that the AM determines finished first, this method
151   * is called to commit an individual task's output.  This is to mark
152   * that tasks output as complete, as {@link #commitJob(JobContext)} will 
153   * also be called later on if the entire job finished successfully. This
154   * is called from a task's process. This may be called multiple times for the
155   * same task, but different task attempts.  It should be very rare for this to
156   * be called multiple times and requires odd networking failures to make this
157   * happen. In the future the Hadoop framework may eliminate this race.
158   * 
159   * @param taskContext Context of the task whose output is being written.
160   * @throws IOException if commit is not successful. 
161   */
162  public abstract void commitTask(TaskAttemptContext taskContext)
163  throws IOException;
164  
165  /**
166   * Discard the task output. This is called from a task's process to clean 
167   * up a single task's output that can not yet been committed. This may be
168   * called multiple times for the same task, but for different task attempts.
169   * 
170   * @param taskContext
171   * @throws IOException
172   */
173  public abstract void abortTask(TaskAttemptContext taskContext)
174  throws IOException;
175
176  /**
177   * Is task output recovery supported for restarting jobs?
178   * 
179   * If task output recovery is supported, job restart can be done more
180   * efficiently.
181   * 
182   * @return <code>true</code> if task output recovery is supported,
183   *         <code>false</code> otherwise
184   * @see #recoverTask(TaskAttemptContext)
185   * @deprecated Use {@link #isRecoverySupported(JobContext)} instead.
186   */
187  @Deprecated
188  public boolean isRecoverySupported() {
189    return false;
190  }
191
192  /**
193   * Is task output recovery supported for restarting jobs?
194   * 
195   * If task output recovery is supported, job restart can be done more
196   * efficiently.
197   * 
198   * @param jobContext
199   *          Context of the job whose output is being written.
200   * @return <code>true</code> if task output recovery is supported,
201   *         <code>false</code> otherwise
202   * @throws IOException
203   * @see #recoverTask(TaskAttemptContext)
204   */
205  public boolean isRecoverySupported(JobContext jobContext) throws IOException {
206    return isRecoverySupported();
207  }
208
209  /**
210   * Recover the task output. 
211   * 
212   * The retry-count for the job will be passed via the 
213   * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in  
214   * {@link TaskAttemptContext#getConfiguration()} for the 
215   * <code>OutputCommitter</code>.  This is called from the application master
216   * process, but it is called individually for each task.
217   * 
218   * If an exception is thrown the task will be attempted again. 
219   * 
220   * This may be called multiple times for the same task.  But from different
221   * application attempts.
222   * 
223   * @param taskContext Context of the task whose output is being recovered
224   * @throws IOException
225   */
226  public void recoverTask(TaskAttemptContext taskContext)
227  throws IOException
228  {}
229}