001/* 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package org.apache.hadoop.io.compress; 019 020import java.util.*; 021 022import org.apache.commons.logging.Log; 023import org.apache.commons.logging.LogFactory; 024import org.apache.hadoop.classification.InterfaceAudience; 025import org.apache.hadoop.classification.InterfaceStability; 026import org.apache.hadoop.conf.Configuration; 027import org.apache.hadoop.fs.Path; 028import org.apache.hadoop.util.ReflectionUtils; 029 030/** 031 * A factory that will find the correct codec for a given filename. 032 */ 033@InterfaceAudience.Public 034@InterfaceStability.Evolving 035public class CompressionCodecFactory { 036 037 public static final Log LOG = 038 LogFactory.getLog(CompressionCodecFactory.class.getName()); 039 040 /** 041 * A map from the reversed filename suffixes to the codecs. 042 * This is probably overkill, because the maps should be small, but it 043 * automatically supports finding the longest matching suffix. 044 */ 045 private SortedMap<String, CompressionCodec> codecs = null; 046 047 /** 048 * A map from the reversed filename suffixes to the codecs. 049 * This is probably overkill, because the maps should be small, but it 050 * automatically supports finding the longest matching suffix. 051 */ 052 private Map<String, CompressionCodec> codecsByName = null; 053 054 /** 055 * A map from class names to the codecs 056 */ 057 private HashMap<String, CompressionCodec> codecsByClassName = null; 058 059 private void addCodec(CompressionCodec codec) { 060 String suffix = codec.getDefaultExtension(); 061 codecs.put(new StringBuilder(suffix).reverse().toString(), codec); 062 codecsByClassName.put(codec.getClass().getCanonicalName(), codec); 063 064 String codecName = codec.getClass().getSimpleName(); 065 codecsByName.put(codecName.toLowerCase(), codec); 066 if (codecName.endsWith("Codec")) { 067 codecName = codecName.substring(0, codecName.length() - "Codec".length()); 068 codecsByName.put(codecName.toLowerCase(), codec); 069 } 070 } 071 072 /** 073 * Print the extension map out as a string. 074 */ 075 public String toString() { 076 StringBuilder buf = new StringBuilder(); 077 Iterator<Map.Entry<String, CompressionCodec>> itr = 078 codecs.entrySet().iterator(); 079 buf.append("{ "); 080 if (itr.hasNext()) { 081 Map.Entry<String, CompressionCodec> entry = itr.next(); 082 buf.append(entry.getKey()); 083 buf.append(": "); 084 buf.append(entry.getValue().getClass().getName()); 085 while (itr.hasNext()) { 086 entry = itr.next(); 087 buf.append(", "); 088 buf.append(entry.getKey()); 089 buf.append(": "); 090 buf.append(entry.getValue().getClass().getName()); 091 } 092 } 093 buf.append(" }"); 094 return buf.toString(); 095 } 096 097 /** 098 * Get the list of codecs listed in the configuration 099 * @param conf the configuration to look in 100 * @return a list of the Configuration classes or null if the attribute 101 * was not set 102 */ 103 public static List<Class<? extends CompressionCodec>> getCodecClasses(Configuration conf) { 104 String codecsString = conf.get("io.compression.codecs"); 105 if (codecsString != null) { 106 List<Class<? extends CompressionCodec>> result 107 = new ArrayList<Class<? extends CompressionCodec>>(); 108 StringTokenizer codecSplit = new StringTokenizer(codecsString, ","); 109 while (codecSplit.hasMoreElements()) { 110 String codecSubstring = codecSplit.nextToken().trim(); 111 if (codecSubstring.length() != 0) { 112 try { 113 Class<?> cls = conf.getClassByName(codecSubstring); 114 if (!CompressionCodec.class.isAssignableFrom(cls)) { 115 throw new IllegalArgumentException("Class " + codecSubstring + 116 " is not a CompressionCodec"); 117 } 118 result.add(cls.asSubclass(CompressionCodec.class)); 119 } catch (ClassNotFoundException ex) { 120 throw new IllegalArgumentException("Compression codec " + 121 codecSubstring + " not found.", 122 ex); 123 } 124 } 125 } 126 return result; 127 } else { 128 return null; 129 } 130 } 131 132 /** 133 * Sets a list of codec classes in the configuration. 134 * @param conf the configuration to modify 135 * @param classes the list of classes to set 136 */ 137 public static void setCodecClasses(Configuration conf, 138 List<Class> classes) { 139 StringBuilder buf = new StringBuilder(); 140 Iterator<Class> itr = classes.iterator(); 141 if (itr.hasNext()) { 142 Class cls = itr.next(); 143 buf.append(cls.getName()); 144 while(itr.hasNext()) { 145 buf.append(','); 146 buf.append(itr.next().getName()); 147 } 148 } 149 conf.set("io.compression.codecs", buf.toString()); 150 } 151 152 /** 153 * Find the codecs specified in the config value io.compression.codecs 154 * and register them. Defaults to gzip and zip. 155 */ 156 public CompressionCodecFactory(Configuration conf) { 157 codecs = new TreeMap<String, CompressionCodec>(); 158 codecsByClassName = new HashMap<String, CompressionCodec>(); 159 codecsByName = new HashMap<String, CompressionCodec>(); 160 List<Class<? extends CompressionCodec>> codecClasses = getCodecClasses(conf); 161 if (codecClasses == null) { 162 addCodec(new GzipCodec()); 163 addCodec(new DefaultCodec()); 164 } else { 165 Iterator<Class<? extends CompressionCodec>> itr = codecClasses.iterator(); 166 while (itr.hasNext()) { 167 CompressionCodec codec = ReflectionUtils.newInstance(itr.next(), conf); 168 addCodec(codec); 169 } 170 } 171 } 172 173 /** 174 * Find the relevant compression codec for the given file based on its 175 * filename suffix. 176 * @param file the filename to check 177 * @return the codec object 178 */ 179 public CompressionCodec getCodec(Path file) { 180 CompressionCodec result = null; 181 if (codecs != null) { 182 String filename = file.getName(); 183 String reversedFilename = new StringBuilder(filename).reverse().toString(); 184 SortedMap<String, CompressionCodec> subMap = 185 codecs.headMap(reversedFilename); 186 if (!subMap.isEmpty()) { 187 String potentialSuffix = subMap.lastKey(); 188 if (reversedFilename.startsWith(potentialSuffix)) { 189 result = codecs.get(potentialSuffix); 190 } 191 } 192 } 193 return result; 194 } 195 196 /** 197 * Find the relevant compression codec for the codec's canonical class name. 198 * @param classname the canonical class name of the codec 199 * @return the codec object 200 */ 201 public CompressionCodec getCodecByClassName(String classname) { 202 if (codecsByClassName == null) { 203 return null; 204 } 205 return codecsByClassName.get(classname); 206 } 207 208 /** 209 * Find the relevant compression codec for the codec's canonical class name 210 * or by codec alias. 211 * <p/> 212 * Codec aliases are case insensitive. 213 * <p/> 214 * The code alias is the short class name (without the package name). 215 * If the short class name ends with 'Codec', then there are two aliases for 216 * the codec, the complete short class name and the short class name without 217 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the 218 * alias are 'gzip' and 'gzipcodec'. 219 * 220 * @param codecName the canonical class name of the codec 221 * @return the codec object 222 */ 223 public CompressionCodec getCodecByName(String codecName) { 224 if (codecsByClassName == null) { 225 return null; 226 } 227 CompressionCodec codec = getCodecByClassName(codecName); 228 if (codec == null) { 229 // trying to get the codec by name in case the name was specified instead a class 230 codec = codecsByName.get(codecName.toLowerCase()); 231 } 232 return codec; 233 } 234 235 /** 236 * Find the relevant compression codec for the codec's canonical class name 237 * or by codec alias and returns its implemetation class. 238 * <p/> 239 * Codec aliases are case insensitive. 240 * <p/> 241 * The code alias is the short class name (without the package name). 242 * If the short class name ends with 'Codec', then there are two aliases for 243 * the codec, the complete short class name and the short class name without 244 * the 'Codec' ending. For example for the 'GzipCodec' codec class name the 245 * alias are 'gzip' and 'gzipcodec'. 246 * 247 * @param codecName the canonical class name of the codec 248 * @return the codec class 249 */ 250 public Class<? extends CompressionCodec> getCodecClassByName(String codecName) { 251 CompressionCodec codec = getCodecByName(codecName); 252 if (codec == null) { 253 return null; 254 } 255 return codec.getClass(); 256 } 257 258 /** 259 * Removes a suffix from a filename, if it has it. 260 * @param filename the filename to strip 261 * @param suffix the suffix to remove 262 * @return the shortened filename 263 */ 264 public static String removeSuffix(String filename, String suffix) { 265 if (filename.endsWith(suffix)) { 266 return filename.substring(0, filename.length() - suffix.length()); 267 } 268 return filename; 269 } 270 271 /** 272 * A little test program. 273 * @param args 274 */ 275 public static void main(String[] args) throws Exception { 276 Configuration conf = new Configuration(); 277 CompressionCodecFactory factory = new CompressionCodecFactory(conf); 278 boolean encode = false; 279 for(int i=0; i < args.length; ++i) { 280 if ("-in".equals(args[i])) { 281 encode = true; 282 } else if ("-out".equals(args[i])) { 283 encode = false; 284 } else { 285 CompressionCodec codec = factory.getCodec(new Path(args[i])); 286 if (codec == null) { 287 System.out.println("Codec for " + args[i] + " not found."); 288 } else { 289 if (encode) { 290 CompressionOutputStream out = null; 291 java.io.InputStream in = null; 292 try { 293 out = codec.createOutputStream( 294 new java.io.FileOutputStream(args[i])); 295 byte[] buffer = new byte[100]; 296 String inFilename = removeSuffix(args[i], 297 codec.getDefaultExtension()); 298 in = new java.io.FileInputStream(inFilename); 299 int len = in.read(buffer); 300 while (len > 0) { 301 out.write(buffer, 0, len); 302 len = in.read(buffer); 303 } 304 } finally { 305 if(out != null) { out.close(); } 306 if(in != null) { in.close(); } 307 } 308 } else { 309 CompressionInputStream in = null; 310 try { 311 in = codec.createInputStream( 312 new java.io.FileInputStream(args[i])); 313 byte[] buffer = new byte[100]; 314 int len = in.read(buffer); 315 while (len > 0) { 316 System.out.write(buffer, 0, len); 317 len = in.read(buffer); 318 } 319 } finally { 320 if(in != null) { in.close(); } 321 } 322 } 323 } 324 } 325 } 326 } 327}