Source code

001/*
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io.compress;
020
021import java.io.IOException;
022import java.io.InputStream;
023
024import org.apache.hadoop.classification.InterfaceAudience;
025import org.apache.hadoop.classification.InterfaceStability;
026
027
028/**
029 * This interface is meant to be implemented by those compression codecs
030 * which are capable to compress / de-compress a stream starting at any
031 * arbitrary position.
032 *
033 * Especially the process of de-compressing a stream starting at some arbitrary
034 * position is challenging.  Most of the codecs are only able to successfully
035 * de-compress a stream, if they start from the very beginning till the end.
036 * One of the reasons is the stored state at the beginning of the stream which
037 * is crucial for de-compression.
038 *
039 * Yet there are few codecs which do not save the whole state at the beginning
040 * of the stream and hence can be used to de-compress stream starting at any
041 * arbitrary points.  This interface is meant to be used by such codecs.  Such
042 * codecs are highly valuable, especially in the context of Hadoop, because
043 * an input compressed file can be split and hence can be worked on by multiple
044 * machines in parallel.
045 */
046@InterfaceAudience.Public
047@InterfaceStability.Evolving
048public interface SplittableCompressionCodec extends CompressionCodec {
049
050  /**
051   * During decompression, data can be read off from the decompressor in two
052   * modes, namely continuous and blocked.  Few codecs (e.g. BZip2) are capable
053   * of compressing data in blocks and then decompressing the blocks.  In
054   * Blocked reading mode codecs inform 'end of block' events to its caller.
055   * While in continuous mode, the caller of codecs is unaware about the blocks
056   * and uncompressed data is spilled out like a continuous stream.
057   */
058  public enum READ_MODE {CONTINUOUS, BYBLOCK};
059
060  /**
061   * Create a stream as dictated by the readMode.  This method is used when
062   * the codecs wants the ability to work with the underlying stream positions.
063   *
064   * @param seekableIn  The seekable input stream (seeks in compressed data)
065   * @param start The start offset into the compressed stream. May be changed
066   *              by the underlying codec.
067   * @param end The end offset into the compressed stream. May be changed by
068   *            the underlying codec.
069   * @param readMode Controls whether stream position is reported continuously
070   *                 from the compressed stream only only at block boundaries.
071   * @return  a stream to read uncompressed bytes from
072   */
073  SplitCompressionInputStream createInputStream(InputStream seekableIn,
074      Decompressor decompressor, long start, long end, READ_MODE readMode)
075      throws IOException;
076
077}