/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.io.compress;

import java.util.*;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.util.ReflectionUtils;

/**
 * A factory that will find the correct codec for a given filename.
 */
@InterfaceAudience.Public
@InterfaceStability.Evolving
public class CompressionCodecFactory {

  public static final Log LOG =
    LogFactory.getLog(CompressionCodecFactory.class.getName());

  private static final ServiceLoader<CompressionCodec> CODEC_PROVIDERS =
    ServiceLoader.load(CompressionCodec.class);

  /**
   * A map from the reversed filename suffixes to the codecs.
   * This is probably overkill, because the maps should be small, but it
   * keeps the suffixes sorted so that the longest matching suffix can be
   * found by walking down from the reversed filename.
   */
  private final SortedMap<String, CompressionCodec> codecs =
    new TreeMap<String, CompressionCodec>();

  /**
   * A map from lower-cased codec aliases to the codecs. The aliases of a
   * codec are its simple class name and, when that name ends with 'Codec',
   * the simple class name without that ending.
   */
  private final Map<String, CompressionCodec> codecsByName =
    new HashMap<String, CompressionCodec>();

  /**
   * A map from canonical class names to the codecs.
   */
  private final Map<String, CompressionCodec> codecsByClassName =
    new HashMap<String, CompressionCodec>();

  /**
   * Normalizes a codec alias into the key form used by
   * {@link #codecsByName}. A fixed locale is used so that registration and
   * lookup agree regardless of the JVM default locale (for example, the
   * Turkish locale lower-cases 'I' to a dotless 'i').
   * @param alias the alias to normalize
   * @return the lower-cased alias
   */
  private static String toAliasKey(String alias) {
    return alias.toLowerCase(Locale.ENGLISH);
  }

  /**
   * Registers a codec under its reversed default extension, its canonical
   * class name and its alias(es).
   * @param codec the codec to register
   */
  private void addCodec(CompressionCodec codec) {
    String suffix = codec.getDefaultExtension();
    codecs.put(new StringBuilder(suffix).reverse().toString(), codec);
    codecsByClassName.put(codec.getClass().getCanonicalName(), codec);

    String codecName = codec.getClass().getSimpleName();
    codecsByName.put(toAliasKey(codecName), codec);
    if (codecName.endsWith("Codec")) {
      codecName = codecName.substring(0, codecName.length() - "Codec".length());
      codecsByName.put(toAliasKey(codecName), codec);
    }
  }

  /**
   * Print the extension map out as a string. The keys are the reversed
   * filename suffixes and the values are the codec class names.
   */
  @Override
  public String toString() {
    StringBuilder buf = new StringBuilder("{ ");
    String separator = "";
    for (Map.Entry<String, CompressionCodec> entry : codecs.entrySet()) {
      buf.append(separator);
      buf.append(entry.getKey());
      buf.append(": ");
      buf.append(entry.getValue().getClass().getName());
      separator = ", ";
    }
    buf.append(" }");
    return buf.toString();
  }

  /**
   * Get the list of codecs discovered via a Java ServiceLoader, or
   * listed in the configuration. Codecs specified in configuration come
   * later in the returned list, and are considered to override those
   * from the ServiceLoader.
   * @param conf the configuration to look in
   * @return a list of the {@link CompressionCodec} classes
   * @throws IllegalArgumentException if a class listed in the configuration
   *         cannot be found or is not a {@link CompressionCodec}
   */
  public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) {
    List<Class<? extends CompressionCodec>> result
      = new ArrayList<Class<? extends CompressionCodec>>();
    // Add codec classes discovered via service loading
    synchronized (CODEC_PROVIDERS) {
      // CODEC_PROVIDERS is a lazy collection. Synchronize so it is
      // thread-safe. See HADOOP-8406.
      for (CompressionCodec codec : CODEC_PROVIDERS) {
        result.add(codec.getClass());
      }
    }
    // Add codec classes from configuration
    String codecsString = conf.get("io.compression.codecs");
    if (codecsString != null) {
      StringTokenizer codecSplit = new StringTokenizer(codecsString, ",");
      while (codecSplit.hasMoreTokens()) {
        String codecSubstring = codecSplit.nextToken().trim();
        if (codecSubstring.length() != 0) {
          try {
            Class<?> cls = conf.getClassByName(codecSubstring);
            if (!CompressionCodec.class.isAssignableFrom(cls)) {
              throw new IllegalArgumentException("Class " + codecSubstring +
                                                 " is not a CompressionCodec");
            }
            result.add(cls.asSubclass(CompressionCodec.class));
          } catch (ClassNotFoundException ex) {
            throw new IllegalArgumentException("Compression codec " +
                                               codecSubstring + " not found.",
                                               ex);
          }
        }
      }
    }
    return result;
  }

  /**
   * Sets a list of codec classes in the configuration. In addition to any
   * classes specified using this method, {@link CompressionCodec} classes on
   * the classpath are discovered using a Java ServiceLoader.
   * @param conf the configuration to modify
   * @param classes the list of classes to set
   */
  // The raw Class type is part of the published signature and is kept
  // for source compatibility with existing callers.
  @SuppressWarnings("rawtypes")
  public static void setCodecClasses(Configuration conf,
                                     List<Class> classes) {
    StringBuilder buf = new StringBuilder();
    Iterator<Class> itr = classes.iterator();
    if (itr.hasNext()) {
      Class cls = itr.next();
      buf.append(cls.getName());
      while (itr.hasNext()) {
        buf.append(',');
        buf.append(itr.next().getName());
      }
    }
    conf.set("io.compression.codecs", buf.toString());
  }

  /**
   * Find the codecs discovered via the ServiceLoader or specified in the
   * config value io.compression.codecs and register them.
   * Defaults to gzip and deflate when no codec is found.
   * @param conf the configuration to read the codec list from
   */
  public CompressionCodecFactory(Configuration conf) {
    List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf);
    if (codecClasses == null || codecClasses.isEmpty()) {
      addCodec(new GzipCodec());
      addCodec(new DefaultCodec());
    } else {
      for (Class<? extends CompressionCodec> codecClass : codecClasses) {
        addCodec(ReflectionUtils.newInstance(codecClass, conf));
      }
    }
  }

  /**
   * Find the relevant compression codec for the given file based on its
   * filename suffix. When several registered suffixes match, the longest
   * one wins. A filename that is exactly equal to a registered suffix is
   * not considered a match.
   * @param file the filename to check
   * @return the codec object, or null if no registered suffix matches
   */
  public CompressionCodec getCodec(Path file) {
    String reversedFilename =
      new StringBuilder(file.getName()).reverse().toString();
    // Every registered suffix that matches is a proper prefix of the
    // reversed filename and therefore sorts strictly before it.
    SortedMap<String, CompressionCodec> candidates =
      codecs.headMap(reversedFilename);
    // Walk down from the greatest candidate: longer matching suffixes sort
    // after shorter ones, so the first hit is the longest match. Keys that
    // sort in between but do not match (e.g. "zg.rat" for "zg.txt.x") are
    // skipped instead of ending the search.
    while (!candidates.isEmpty()) {
      String potentialSuffix = candidates.lastKey();
      if (reversedFilename.startsWith(potentialSuffix)) {
        return codecs.get(potentialSuffix);
      }
      candidates = candidates.headMap(potentialSuffix);
    }
    return null;
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name.
   * @param classname the canonical class name of the codec
   * @return the codec object, or null if no such codec is registered
   */
  public CompressionCodec getCodecByClassName(String classname) {
    return codecsByClassName.get(classname);
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name
   * or by codec alias.
   * <p>
   * Codec aliases are case insensitive.
   * <p>
   * The codec alias is the short class name (without the package name).
   * If the short class name ends with 'Codec', then there are two aliases for
   * the codec, the complete short class name and the short class name without
   * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
   * aliases are 'gzip' and 'gzipcodec'.
   *
   * @param codecName the canonical class name or an alias of the codec
   * @return the codec object, or null if no such codec is registered
   */
  public CompressionCodec getCodecByName(String codecName) {
    CompressionCodec codec = getCodecByClassName(codecName);
    if (codec == null) {
      // try to get the codec by alias in case an alias was specified
      // instead of a class name
      codec = codecsByName.get(toAliasKey(codecName));
    }
    return codec;
  }

  /**
   * Find the relevant compression codec for the codec's canonical class name
   * or by codec alias and returns its implementation class.
   * <p>
   * Codec aliases are case insensitive.
   * <p>
   * The codec alias is the short class name (without the package name).
   * If the short class name ends with 'Codec', then there are two aliases for
   * the codec, the complete short class name and the short class name without
   * the 'Codec' ending. For example for the 'GzipCodec' codec class name the
   * aliases are 'gzip' and 'gzipcodec'.
   *
   * @param codecName the canonical class name or an alias of the codec
   * @return the codec class, or null if no such codec is registered
   */
  public Class<? extends CompressionCodec> getCodecClassByName(String codecName) {
    CompressionCodec codec = getCodecByName(codecName);
    if (codec == null) {
      return null;
    }
    return codec.getClass();
  }

  /**
   * Removes a suffix from a filename, if it has it.
   * @param filename the filename to strip
   * @param suffix the suffix to remove
   * @return the shortened filename
   */
  public static String removeSuffix(String filename, String suffix) {
    if (filename.endsWith(suffix)) {
      return filename.substring(0, filename.length() - suffix.length());
    }
    return filename;
  }

  /**
   * Compresses the file obtained by stripping the codec's default extension
   * from the given name, writing the result to the given name.
   * @param codec the codec to compress with
   * @param outFilename the name of the compressed file to write
   * @throws java.io.IOException if reading or writing fails
   */
  private static void compressFile(CompressionCodec codec, String outFilename)
      throws java.io.IOException {
    String inFilename = removeSuffix(outFilename, codec.getDefaultExtension());
    // Open the input first so that a missing input file does not leave an
    // empty output file behind.
    java.io.InputStream in = new java.io.FileInputStream(inFilename);
    try {
      java.io.OutputStream rawOut = new java.io.FileOutputStream(outFilename);
      try {
        CompressionOutputStream out = codec.createOutputStream(rawOut);
        try {
          byte[] buffer = new byte[100];
          int len = in.read(buffer);
          while (len > 0) {
            out.write(buffer, 0, len);
            len = in.read(buffer);
          }
        } finally {
          out.close();
        }
      } finally {
        // Releases the file even if the codec stream could not be created;
        // otherwise this is a second, harmless close.
        rawOut.close();
      }
    } finally {
      in.close();
    }
  }

  /**
   * Decompresses the given file and writes its content to standard output.
   * @param codec the codec to decompress with
   * @param inFilename the name of the compressed file to read
   * @throws java.io.IOException if reading fails
   */
  private static void decompressToStdout(CompressionCodec codec,
                                         String inFilename)
      throws java.io.IOException {
    java.io.InputStream rawIn = new java.io.FileInputStream(inFilename);
    try {
      CompressionInputStream in = codec.createInputStream(rawIn);
      try {
        byte[] buffer = new byte[100];
        int len = in.read(buffer);
        while (len > 0) {
          System.out.write(buffer, 0, len);
          len = in.read(buffer);
        }
        System.out.flush();
      } finally {
        in.close();
      }
    } finally {
      // Releases the file even if the codec stream could not be created;
      // otherwise this is a second, harmless close.
      rawIn.close();
    }
  }

  /**
   * A little test program.
   * @param args a sequence of filenames and mode switches: filenames after
   *        "-in" are compressed from the file with the codec extension
   *        removed, filenames after "-out" (the initial mode) are
   *        decompressed to standard output
   */
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    CompressionCodecFactory factory = new CompressionCodecFactory(conf);
    boolean encode = false;
    for (int i = 0; i < args.length; ++i) {
      if ("-in".equals(args[i])) {
        encode = true;
      } else if ("-out".equals(args[i])) {
        encode = false;
      } else {
        CompressionCodec codec = factory.getCodec(new Path(args[i]));
        if (codec == null) {
          System.out.println("Codec for " + args[i] + " not found.");
        } else if (encode) {
          compressFile(codec, args[i]);
        } else {
          decompressToStdout(codec, args[i]);
        }
      }
    }
  }
}