001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.mapreduce.lib.join; 020 021 import java.io.IOException; 022 import java.util.ArrayList; 023 import java.util.List; 024 import java.util.Map; 025 import java.util.regex.Matcher; 026 import java.util.regex.Pattern; 027 028 import org.apache.hadoop.classification.InterfaceAudience; 029 import org.apache.hadoop.classification.InterfaceStability; 030 import org.apache.hadoop.conf.Configuration; 031 import org.apache.hadoop.fs.Path; 032 import org.apache.hadoop.io.WritableComparable; 033 import org.apache.hadoop.mapreduce.InputFormat; 034 import org.apache.hadoop.mapreduce.InputSplit; 035 import org.apache.hadoop.mapreduce.JobContext; 036 import org.apache.hadoop.mapreduce.RecordReader; 037 import org.apache.hadoop.mapreduce.TaskAttemptContext; 038 039 /** 040 * An InputFormat capable of performing joins over a set of data sources sorted 041 * and partitioned the same way. 042 * @see #setFormat 043 * 044 * A user may define new join types by setting the property 045 * <tt>mapreduce.join.define.<ident></tt> to a classname. 046 * In the expression <tt>mapreduce.join.expr</tt>, the identifier will be 047 * assumed to be a ComposableRecordReader. 048 * <tt>mapreduce.join.keycomparator</tt> can be a classname used to compare 049 * keys in the join. 050 * @see JoinRecordReader 051 * @see MultiFilterRecordReader 052 */ 053 @SuppressWarnings("unchecked") 054 @InterfaceAudience.Public 055 @InterfaceStability.Stable 056 public class CompositeInputFormat<K extends WritableComparable> 057 extends InputFormat<K, TupleWritable> { 058 059 public static final String JOIN_EXPR = "mapreduce.join.expr"; 060 public static final String JOIN_COMPARATOR = "mapreduce.join.keycomparator"; 061 062 // expression parse tree to which IF requests are proxied 063 private Parser.Node root; 064 065 public CompositeInputFormat() { } 066 067 068 /** 069 * Interpret a given string as a composite expression. 070 * {@code 071 * func ::= <ident>([<func>,]*<func>) 072 * func ::= tbl(<class>,"<path>") 073 * class ::= @see java.lang.Class#forName(java.lang.String) 074 * path ::= @see org.apache.hadoop.fs.Path#Path(java.lang.String) 075 * } 076 * Reads expression from the <tt>mapreduce.join.expr</tt> property and 077 * user-supplied join types from <tt>mapreduce.join.define.<ident></tt> 078 * types. Paths supplied to <tt>tbl</tt> are given as input paths to the 079 * InputFormat class listed. 080 * @see #compose(java.lang.String, java.lang.Class, java.lang.String...) 081 */ 082 public void setFormat(Configuration conf) throws IOException { 083 addDefaults(); 084 addUserIdentifiers(conf); 085 root = Parser.parse(conf.get(JOIN_EXPR, null), conf); 086 } 087 088 /** 089 * Adds the default set of identifiers to the parser. 090 */ 091 protected void addDefaults() { 092 try { 093 Parser.CNode.addIdentifier("inner", InnerJoinRecordReader.class); 094 Parser.CNode.addIdentifier("outer", OuterJoinRecordReader.class); 095 Parser.CNode.addIdentifier("override", OverrideRecordReader.class); 096 Parser.WNode.addIdentifier("tbl", WrappedRecordReader.class); 097 } catch (NoSuchMethodException e) { 098 throw new RuntimeException("FATAL: Failed to init defaults", e); 099 } 100 } 101 102 /** 103 * Inform the parser of user-defined types. 104 */ 105 private void addUserIdentifiers(Configuration conf) throws IOException { 106 Pattern x = Pattern.compile("^mapreduce\\.join\\.define\\.(\\w+)$"); 107 for (Map.Entry<String,String> kv : conf) { 108 Matcher m = x.matcher(kv.getKey()); 109 if (m.matches()) { 110 try { 111 Parser.CNode.addIdentifier(m.group(1), 112 conf.getClass(m.group(0), null, ComposableRecordReader.class)); 113 } catch (NoSuchMethodException e) { 114 throw new IOException("Invalid define for " + m.group(1), e); 115 } 116 } 117 } 118 } 119 120 /** 121 * Build a CompositeInputSplit from the child InputFormats by assigning the 122 * ith split from each child to the ith composite split. 123 */ 124 @SuppressWarnings("unchecked") 125 public List<InputSplit> getSplits(JobContext job) 126 throws IOException, InterruptedException { 127 setFormat(job.getConfiguration()); 128 job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize", Long.MAX_VALUE); 129 return root.getSplits(job); 130 } 131 132 /** 133 * Construct a CompositeRecordReader for the children of this InputFormat 134 * as defined in the init expression. 135 * The outermost join need only be composable, not necessarily a composite. 136 * Mandating TupleWritable isn't strictly correct. 137 */ 138 @SuppressWarnings("unchecked") // child types unknown 139 public RecordReader<K,TupleWritable> createRecordReader(InputSplit split, 140 TaskAttemptContext taskContext) 141 throws IOException, InterruptedException { 142 setFormat(taskContext.getConfiguration()); 143 return root.createRecordReader(split, taskContext); 144 } 145 146 /** 147 * Convenience method for constructing composite formats. 148 * Given InputFormat class (inf), path (p) return: 149 * {@code tbl(<inf>, <p>) } 150 */ 151 public static String compose(Class<? extends InputFormat> inf, 152 String path) { 153 return compose(inf.getName().intern(), path, 154 new StringBuffer()).toString(); 155 } 156 157 /** 158 * Convenience method for constructing composite formats. 159 * Given operation (op), Object class (inf), set of paths (p) return: 160 * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } 161 */ 162 public static String compose(String op, 163 Class<? extends InputFormat> inf, String... path) { 164 final String infname = inf.getName(); 165 StringBuffer ret = new StringBuffer(op + '('); 166 for (String p : path) { 167 compose(infname, p, ret); 168 ret.append(','); 169 } 170 ret.setCharAt(ret.length() - 1, ')'); 171 return ret.toString(); 172 } 173 174 /** 175 * Convenience method for constructing composite formats. 176 * Given operation (op), Object class (inf), set of paths (p) return: 177 * {@code <op>(tbl(<inf>,<p1>),tbl(<inf>,<p2>),...,tbl(<inf>,<pn>)) } 178 */ 179 public static String compose(String op, 180 Class<? extends InputFormat> inf, Path... path) { 181 ArrayList<String> tmp = new ArrayList<String>(path.length); 182 for (Path p : path) { 183 tmp.add(p.toString()); 184 } 185 return compose(op, inf, tmp.toArray(new String[0])); 186 } 187 188 private static StringBuffer compose(String inf, String path, 189 StringBuffer sb) { 190 sb.append("tbl(" + inf + ",\""); 191 sb.append(path); 192 sb.append("\")"); 193 return sb; 194 } 195 }