/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce;

import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

025 /**
026 * <code>OutputCommitter</code> describes the commit of task output for a
027 * Map-Reduce job.
028 *
029 * <p>The Map-Reduce framework relies on the <code>OutputCommitter</code> of
030 * the job to:<p>
031 * <ol>
032 * <li>
033 * Setup the job during initialization. For example, create the temporary
034 * output directory for the job during the initialization of the job.
035 * </li>
036 * <li>
037 * Cleanup the job after the job completion. For example, remove the
038 * temporary output directory after the job completion.
039 * </li>
040 * <li>
041 * Setup the task temporary output.
042 * </li>
043 * <li>
044 * Check whether a task needs a commit. This is to avoid the commit
045 * procedure if a task does not need commit.
046 * </li>
047 * <li>
048 * Commit of the task output.
049 * </li>
050 * <li>
051 * Discard the task commit.
052 * </li>
053 * </ol>
054 * The methods in this class can be called from several different processes and
055 * from several different contexts. It is important to know which process and
056 * which context each is called from. Each method should be marked accordingly
057 * in its documentation. It is also important to note that not all methods are
058 * guaranteed to be called once and only once. If a method is not guaranteed to
059 * have this property the output committer needs to handle this appropriately.
060 * Also note it will only be in rare situations where they may be called
061 * multiple times for the same task.
062 *
063 * @see org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter
064 * @see JobContext
065 * @see TaskAttemptContext
066 */
067 @InterfaceAudience.Public
068 @InterfaceStability.Stable
069 public abstract class OutputCommitter {
070 /**
071 * For the framework to setup the job output during initialization. This is
072 * called from the application master process for the entire job. This will be
073 * called multiple times, once per job attempt.
074 *
075 * @param jobContext Context of the job whose output is being written.
076 * @throws IOException if temporary output could not be created
077 */
078 public abstract void setupJob(JobContext jobContext) throws IOException;
079
080 /**
081 * For cleaning up the job's output after job completion. This is called
082 * from the application master process for the entire job. This may be called
083 * multiple times.
084 *
085 * @param jobContext Context of the job whose output is being written.
086 * @throws IOException
087 * @deprecated Use {@link #commitJob(JobContext)} and
088 * {@link #abortJob(JobContext, JobStatus.State)} instead.
089 */
090 @Deprecated
091 public void cleanupJob(JobContext jobContext) throws IOException { }
092
093 /**
094 * For committing job's output after successful job completion. Note that this
095 * is invoked for jobs with final runstate as SUCCESSFUL. This is called
096 * from the application master process for the entire job. This is guaranteed
097 * to only be called once. If it throws an exception the entire job will
098 * fail.
099 *
100 * @param jobContext Context of the job whose output is being written.
101 * @throws IOException
102 */
103 public void commitJob(JobContext jobContext) throws IOException {
104 cleanupJob(jobContext);
105 }
106
107
108 /**
109 * For aborting an unsuccessful job's output. Note that this is invoked for
110 * jobs with final runstate as {@link JobStatus.State#FAILED} or
111 * {@link JobStatus.State#KILLED}. This is called from the application
112 * master process for the entire job. This may be called multiple times.
113 *
114 * @param jobContext Context of the job whose output is being written.
115 * @param state final runstate of the job
116 * @throws IOException
117 */
118 public void abortJob(JobContext jobContext, JobStatus.State state)
119 throws IOException {
120 cleanupJob(jobContext);
121 }
122
123 /**
124 * Sets up output for the task. This is called from each individual task's
125 * process that will output to HDFS, and it is called just for that task. This
126 * may be called multiple times for the same task, but for different task
127 * attempts.
128 *
129 * @param taskContext Context of the task whose output is being written.
130 * @throws IOException
131 */
132 public abstract void setupTask(TaskAttemptContext taskContext)
133 throws IOException;
134
135 /**
136 * Check whether task needs a commit. This is called from each individual
137 * task's process that will output to HDFS, and it is called just for that
138 * task.
139 *
140 * @param taskContext
141 * @return true/false
142 * @throws IOException
143 */
144 public abstract boolean needsTaskCommit(TaskAttemptContext taskContext)
145 throws IOException;
146
147 /**
148 * To promote the task's temporary output to final output location.
149 * If {@link #needsTaskCommit(TaskAttemptContext)} returns true and this
150 * task is the task that the AM determines finished first, this method
151 * is called to commit an individual task's output. This is to mark
152 * that tasks output as complete, as {@link #commitJob(JobContext)} will
153 * also be called later on if the entire job finished successfully. This
154 * is called from a task's process. This may be called multiple times for the
155 * same task, but different task attempts. It should be very rare for this to
156 * be called multiple times and requires odd networking failures to make this
157 * happen. In the future the Hadoop framework may eliminate this race.
158 *
159 * @param taskContext Context of the task whose output is being written.
160 * @throws IOException if commit is not successful.
161 */
162 public abstract void commitTask(TaskAttemptContext taskContext)
163 throws IOException;
164
165 /**
166 * Discard the task output. This is called from a task's process to clean
167 * up a single task's output that can not yet been committed. This may be
168 * called multiple times for the same task, but for different task attempts.
169 *
170 * @param taskContext
171 * @throws IOException
172 */
173 public abstract void abortTask(TaskAttemptContext taskContext)
174 throws IOException;
175
176 /**
177 * Is task output recovery supported for restarting jobs?
178 *
179 * If task output recovery is supported, job restart can be done more
180 * efficiently.
181 *
182 * @return <code>true</code> if task output recovery is supported,
183 * <code>false</code> otherwise
184 * @see #recoverTask(TaskAttemptContext)
185 * @deprecated Use {@link #isRecoverySupported(JobContext)} instead.
186 */
187 @Deprecated
188 public boolean isRecoverySupported() {
189 return false;
190 }
191
192 /**
193 * Is task output recovery supported for restarting jobs?
194 *
195 * If task output recovery is supported, job restart can be done more
196 * efficiently.
197 *
198 * @param jobContext
199 * Context of the job whose output is being written.
200 * @return <code>true</code> if task output recovery is supported,
201 * <code>false</code> otherwise
202 * @throws IOException
203 * @see #recoverTask(TaskAttemptContext)
204 */
205 public boolean isRecoverySupported(JobContext jobContext) throws IOException {
206 return isRecoverySupported();
207 }
208
209 /**
210 * Recover the task output.
211 *
212 * The retry-count for the job will be passed via the
213 * {@link MRJobConfig#APPLICATION_ATTEMPT_ID} key in
214 * {@link TaskAttemptContext#getConfiguration()} for the
215 * <code>OutputCommitter</code>. This is called from the application master
216 * process, but it is called individually for each task.
217 *
218 * If an exception is thrown the task will be attempted again.
219 *
220 * This may be called multiple times for the same task. But from different
221 * application attempts.
222 *
223 * @param taskContext Context of the task whose output is being recovered
224 * @throws IOException
225 */
226 public void recoverTask(TaskAttemptContext taskContext)
227 throws IOException
228 {}
229 }