001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapreduce;
020    
021    import java.io.IOException;
022    
023    import org.apache.hadoop.classification.InterfaceAudience;
024    import org.apache.hadoop.classification.InterfaceStability;
025    /**
026     * <code>OutputCommitter</code> describes the commit of task output for a 
027     * Map-Reduce job.
028     *
029     * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of 
030     * the job to:</p>
031     * <ol>
032     *   <li>
033     *   Set up the job during initialization. For example, create the temporary 
034     *   output directory for the job during the initialization of the job.
035     *   </li>
036     *   <li>
037     *   Clean up the job after the job completion. For example, remove the
038     *   temporary output directory after the job completion. 
039     *   </li>
040     *   <li>
041     *   Set up the task temporary output.
042     *   </li> 
043     *   <li>
044     *   Check whether a task needs a commit. This is to avoid the commit
045     *   procedure if a task does not need commit.
046     *   </li>
047     *   <li>
048     *   Commit the task output.
049     *   </li>  
050     *   <li>
051     *   Discard the task commit.
052     *   </li>
053     * </ol>
054     * <p>The methods in this class can be called from several different processes
055     * and from several different contexts.  It is important to know which process
056     * and which context each is called from.  Each method should be marked
057     * accordingly in its documentation.  It is also important to note that not all
058     * methods are guaranteed to be called once and only once.  If a method does not
059     * have this guarantee, the output committer needs to handle repeated calls
060     * appropriately.  Note that it is only in rare situations that a method may be
061     * called multiple times for the same task.</p>
062     * 
063     * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter 
064     * @see JobContext
065     * @see TaskAttemptContext 
066     */
067    @InterfaceAudience.Public
068    @InterfaceStability.Stable
069    public abstract class OutputCommitter {
070      /**
071       * Sets up the job output during initialization.  This is called by the
072       * framework from the application master process for the entire job. This
073       * will be called multiple times, once per job attempt.
074       * 
075       * @param jobContext Context of the job whose output is being written.
076       * @throws IOException if temporary output could not be created
077       */
078      public abstract void setupJob(JobContext jobContext) throws IOException;
079    
080      /**
081       * Cleans up the job's output after job completion.  This is called from the
082       * application master process for the entire job. This may be called multiple
083       * times.  The default implementation does nothing.
084       * 
085       * @param jobContext Context of the job whose output is being written.
086       * @throws IOException may be thrown by overriding implementations
087       * @deprecated Use {@link #commitJob(JobContext)} and
088       *                 {@link #abortJob(JobContext, JobStatus.State)} instead.
089       */
090      @Deprecated
091      public void cleanupJob(JobContext jobContext) throws IOException { }
092    
093      /**
094       * Commits the job's output after successful job completion. Note that this
095       * is invoked for jobs with final runstate as SUCCESSFUL.  This is called
096       * from the application master process for the entire job. This is guaranteed
097       * to only be called once.  If it throws an exception the entire job will
098       * fail.  The default implementation delegates to the deprecated cleanupJob.
099       * 
100       * @param jobContext Context of the job whose output is being written.
101       * @throws IOException propagated from {@link #cleanupJob(JobContext)} by default
102       */
103      public void commitJob(JobContext jobContext) throws IOException {
104        cleanupJob(jobContext);
105      }
106    
107      
108      /**
109       * Aborts an unsuccessful job's output. Note that this is invoked for jobs
110       * with final runstate as {@link JobStatus.State#FAILED} or 
111       * {@link JobStatus.State#KILLED}.  This is called from the application
112       * master process for the entire job. This may be called multiple times.
113       * The default implementation delegates to {@link #cleanupJob(JobContext)}.
114       * @param jobContext Context of the job whose output is being written.
115       * @param state final runstate of the job (unused by the default implementation)
116       * @throws IOException propagated from {@link #cleanupJob(JobContext)} by default
117       */
118      public void abortJob(JobContext jobContext, JobStatus.State state) 
119      throws IOException {
120        cleanupJob(jobContext);
121      }
122      
123      /**
124       * Sets up output for the task.  This is called from each individual task's
125       * process that will output to HDFS, and it is called just for that task. This
126       * may be called multiple times for the same task, but for different task
127       * attempts.
128       * 
129       * @param taskContext Context of the task whose output is being written.
130       * @throws IOException if setting up the task output fails
131       */
132      public abstract void setupTask(TaskAttemptContext taskContext)
133      throws IOException;
134      
135      /**
136       * Checks whether the task needs a commit.  This is called from each individual
137       * task's process that will output to HDFS, and it is called just for that
138       * task.
139       * 
140       * @param taskContext Context of the task whose output is being written.
141       * @return <code>true</code> if the task needs a commit, <code>false</code> otherwise
142       * @throws IOException if the check fails
143       */
144      public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
145      throws IOException;
146    
147      /**
148       * Promotes the task's temporary output to the final output location.
149       * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
150       * task is the task that the AM determines finished first, this method
151       * is called to commit an individual task's output.  This is to mark
152       * that task's output as complete, as {@link #commitJob(JobContext)} will 
153       * also be called later on if the entire job finished successfully. This
154       * is called from a task's process. This may be called multiple times for the
155       * same task, but different task attempts.  It should be very rare for this to
156       * be called multiple times and requires odd networking failures to make this
157       * happen. In the future the Hadoop framework may eliminate this race.
158       * 
159       * @param taskContext Context of the task whose output is being written.
160       * @throws IOException if commit is not successful. 
161       */
162      public abstract void commitTask(TaskAttemptContext taskContext)
163      throws IOException;
164      
165      /**
166       * Discards the task output. This is called from a task's process to clean 
167       * up a single task's output that has not yet been committed. This may be
168       * called multiple times for the same task, but for different task attempts.
169       * 
170       * @param taskContext Context of the task whose output is being discarded.
171       * @throws IOException if discarding the task output fails
172       */
173      public abstract void abortTask(TaskAttemptContext taskContext)
174      throws IOException;
175    
176      /**
177       * Is task output recovery supported for restarting jobs?
178       * 
179       * If task output recovery is supported, job restart can be done more 
180       * efficiently.  The default implementation returns <code>false</code>.
181       * 
182       * @return <code>true</code> if task output recovery is supported,
183       *         <code>false</code> otherwise
184       * @see #recoverTask(TaskAttemptContext)         
185       */
186      public boolean isRecoverySupported() {
187        return false;
188      }
189      
190      /**
191       * Recovers the task output.  The default implementation does nothing.
192       * 
193       * The retry-count for the job will be passed via the 
194       * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in  
195       * {@link TaskAttemptContext#getConfiguration()} for the 
196       * <code>OutputCommitter</code>.  This is called from the application master
197       * process, but it is called individually for each task.
198       * 
199       * If an exception is thrown the task will be attempted again. 
200       * 
201       * This may be called multiple times for the same task, but from different
202       * application attempts.
203       * 
204       * @param taskContext Context of the task whose output is being recovered
205       * @throws IOException may be thrown by overriding implementations
206       */
207      public void recoverTask(TaskAttemptContext taskContext)
208      throws IOException
209      {}
210    }