001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.mapred.join;
020    
021    import java.io.IOException;
022    import java.util.ArrayList;
023    import java.util.Comparator;
024    import java.util.PriorityQueue;
025    
026    import org.apache.hadoop.classification.InterfaceAudience;
027    import org.apache.hadoop.classification.InterfaceStability;
028    import org.apache.hadoop.conf.Configurable;
029    import org.apache.hadoop.conf.Configuration;
030    import org.apache.hadoop.io.Writable;
031    import org.apache.hadoop.io.WritableComparable;
032    import org.apache.hadoop.io.WritableComparator;
033    import org.apache.hadoop.io.WritableUtils;
034    import org.apache.hadoop.mapred.RecordReader;
035    import org.apache.hadoop.util.ReflectionUtils;
036    
037    /**
038     * A RecordReader that can effect joins of RecordReaders sharing a common key
039     * type and partitioning.
040     */
041    @InterfaceAudience.Public
042    @InterfaceStability.Stable
043    public abstract class CompositeRecordReader<
044        K extends WritableComparable, // key type
045        V extends Writable,           // accepts RecordReader<K,V> as children
046        X extends Writable>           // emits Writables of this type
047        implements Configurable {
048    
049    
050      private int id;
051      private Configuration conf;
052      private final ResetableIterator<X> EMPTY = new ResetableIterator.EMPTY<X>();
053    
054      private WritableComparator cmp;
055      private Class<? extends WritableComparable> keyclass;
056      private PriorityQueue<ComposableRecordReader<K,?>> q;
057    
058      protected final JoinCollector jc;
059      protected final ComposableRecordReader<K,? extends V>[] kids;
060    
061      protected abstract boolean combine(Object[] srcs, TupleWritable value);
062    
063      /**
064       * Create a RecordReader with <tt>capacity</tt> children to position
065       * <tt>id</tt> in the parent reader.
066       * The id of a root CompositeRecordReader is -1 by convention, but relying
067       * on this is not recommended.
068       */
069      @SuppressWarnings("unchecked") // Generic array assignment
070      public CompositeRecordReader(int id, int capacity,
071          Class<? extends WritableComparator> cmpcl)
072          throws IOException {
073        assert capacity > 0 : "Invalid capacity";
074        this.id = id;
075        if (null != cmpcl) {
076          cmp = ReflectionUtils.newInstance(cmpcl, null);
077          q = new PriorityQueue<ComposableRecordReader<K,?>>(3,
078              new Comparator<ComposableRecordReader<K,?>>() {
079                public int compare(ComposableRecordReader<K,?> o1,
080                                   ComposableRecordReader<K,?> o2) {
081                  return cmp.compare(o1.key(), o2.key());
082                }
083              });
084        }
085        jc = new JoinCollector(capacity);
086        kids = new ComposableRecordReader[capacity];
087      }
088    
089      /**
090       * Return the position in the collector this class occupies.
091       */
092      public int id() {
093        return id;
094      }
095    
096      /**
097       * {@inheritDoc}
098       */
099      public void setConf(Configuration conf) {
100        this.conf = conf;
101      }
102    
103      /**
104       * {@inheritDoc}
105       */
106      public Configuration getConf() {
107        return conf;
108      }
109    
110      /**
111       * Return sorted list of RecordReaders for this composite.
112       */
113      protected PriorityQueue<ComposableRecordReader<K,?>> getRecordReaderQueue() {
114        return q;
115      }
116    
117      /**
118       * Return comparator defining the ordering for RecordReaders in this
119       * composite.
120       */
121      protected WritableComparator getComparator() {
122        return cmp;
123      }
124    
125      /**
126       * Add a RecordReader to this collection.
127       * The id() of a RecordReader determines where in the Tuple its
128       * entry will appear. Adding RecordReaders with the same id has
129       * undefined behavior.
130       */
131      public void add(ComposableRecordReader<K,? extends V> rr) throws IOException {
132        kids[rr.id()] = rr;
133        if (null == q) {
134          cmp = WritableComparator.get(rr.createKey().getClass(), conf);
135          q = new PriorityQueue<ComposableRecordReader<K,?>>(3,
136              new Comparator<ComposableRecordReader<K,?>>() {
137                public int compare(ComposableRecordReader<K,?> o1,
138                                   ComposableRecordReader<K,?> o2) {
139                  return cmp.compare(o1.key(), o2.key());
140                }
141              });
142        }
143        if (rr.hasNext()) {
144          q.add(rr);
145        }
146      }
147    
148      /**
149       * Collector for join values.
150       * This accumulates values for a given key from the child RecordReaders. If
151       * one or more child RR contain duplicate keys, this will emit the cross
152       * product of the associated values until exhausted.
153       */
154      class JoinCollector {
155        private K key;
156        private ResetableIterator<X>[] iters;
157        private int pos = -1;
158        private boolean first = true;
159    
160        /**
161         * Construct a collector capable of handling the specified number of
162         * children.
163         */
164        @SuppressWarnings("unchecked") // Generic array assignment
165        public JoinCollector(int card) {
166          iters = new ResetableIterator[card];
167          for (int i = 0; i < iters.length; ++i) {
168            iters[i] = EMPTY;
169          }
170        }
171    
172        /**
173         * Register a given iterator at position id.
174         */
175        public void add(int id, ResetableIterator<X> i)
176            throws IOException {
177          iters[id] = i;
178        }
179    
180        /**
181         * Return the key associated with this collection.
182         */
183        public K key() {
184          return key;
185        }
186    
187        /**
188         * Codify the contents of the collector to be iterated over.
189         * When this is called, all RecordReaders registered for this
190         * key should have added ResetableIterators.
191         */
192        public void reset(K key) {
193          this.key = key;
194          first = true;
195          pos = iters.length - 1;
196          for (int i = 0; i < iters.length; ++i) {
197            iters[i].reset();
198          }
199        }
200    
201        /**
202         * Clear all state information.
203         */
204        public void clear() {
205          key = null;
206          pos = -1;
207          for (int i = 0; i < iters.length; ++i) {
208            iters[i].clear();
209            iters[i] = EMPTY;
210          }
211        }
212    
213        /**
214         * Returns false if exhausted or if reset(K) has not been called.
215         */
216        protected boolean hasNext() {
217          return !(pos < 0);
218        }
219    
220        /**
221         * Populate Tuple from iterators.
222         * It should be the case that, given iterators i_1...i_n over values from
223         * sources s_1...s_n sharing key k, repeated calls to next should yield
224         * I x I.
225         */
226        @SuppressWarnings("unchecked") // No static typeinfo on Tuples
227        protected boolean next(TupleWritable val) throws IOException {
228          if (first) {
229            int i = -1;
230            for (pos = 0; pos < iters.length; ++pos) {
231              if (iters[pos].hasNext() && iters[pos].next((X)val.get(pos))) {
232                i = pos;
233                val.setWritten(i);
234              }
235            }
236            pos = i;
237            first = false;
238            if (pos < 0) {
239              clear();
240              return false;
241            }
242            return true;
243          }
244          while (0 <= pos && !(iters[pos].hasNext() &&
245                               iters[pos].next((X)val.get(pos)))) {
246            --pos;
247          }
248          if (pos < 0) {
249            clear();
250            return false;
251          }
252          val.setWritten(pos);
253          for (int i = 0; i < pos; ++i) {
254            if (iters[i].replay((X)val.get(i))) {
255              val.setWritten(i);
256            }
257          }
258          while (pos + 1 < iters.length) {
259            ++pos;
260            iters[pos].reset();
261            if (iters[pos].hasNext() && iters[pos].next((X)val.get(pos))) {
262              val.setWritten(pos);
263            }
264          }
265          return true;
266        }
267    
268        /**
269         * Replay the last Tuple emitted.
270         */
271        @SuppressWarnings("unchecked") // No static typeinfo on Tuples
272        public boolean replay(TupleWritable val) throws IOException {
273          // The last emitted tuple might have drawn on an empty source;
274          // it can't be cleared prematurely, b/c there may be more duplicate
275          // keys in iterator positions < pos
276          assert !first;
277          boolean ret = false;
278          for (int i = 0; i < iters.length; ++i) {
279            if (iters[i].replay((X)val.get(i))) {
280              val.setWritten(i);
281              ret = true;
282            }
283          }
284          return ret;
285        }
286    
287        /**
288         * Close all child iterators.
289         */
290        public void close() throws IOException {
291          for (int i = 0; i < iters.length; ++i) {
292            iters[i].close();
293          }
294        }
295    
296        /**
297         * Write the next value into key, value as accepted by the operation
298         * associated with this set of RecordReaders.
299         */
300        public boolean flush(TupleWritable value) throws IOException {
301          while (hasNext()) {
302            value.clearWritten();
303            if (next(value) && combine(kids, value)) {
304              return true;
305            }
306          }
307          return false;
308        }
309      }
310    
311      /**
312       * Return the key for the current join or the value at the top of the
313       * RecordReader heap.
314       */
315      public K key() {
316        if (jc.hasNext()) {
317          return jc.key();
318        }
319        if (!q.isEmpty()) {
320          return q.peek().key();
321        }
322        return null;
323      }
324    
325      /**
326       * Clone the key at the top of this RR into the given object.
327       */
328      public void key(K key) throws IOException {
329        WritableUtils.cloneInto(key, key());
330      }
331    
332      /**
333       * Return true if it is possible that this could emit more values.
334       */
335      public boolean hasNext() {
336        return jc.hasNext() || !q.isEmpty();
337      }
338    
339      /**
340       * Pass skip key to child RRs.
341       */
342      public void skip(K key) throws IOException {
343        ArrayList<ComposableRecordReader<K,?>> tmp =
344          new ArrayList<ComposableRecordReader<K,?>>();
345        while (!q.isEmpty() && cmp.compare(q.peek().key(), key) <= 0) {
346          tmp.add(q.poll());
347        }
348        for (ComposableRecordReader<K,?> rr : tmp) {
349          rr.skip(key);
350          if (rr.hasNext()) {
351            q.add(rr);
352          }
353        }
354      }
355    
356      /**
357       * Obtain an iterator over the child RRs apropos of the value type
358       * ultimately emitted from this join.
359       */
360      protected abstract ResetableIterator<X> getDelegate();
361    
362      /**
363       * If key provided matches that of this Composite, give JoinCollector
364       * iterator over values it may emit.
365       */
366      @SuppressWarnings("unchecked") // No values from static EMPTY class
367      public void accept(CompositeRecordReader.JoinCollector jc, K key)
368          throws IOException {
369        if (hasNext() && 0 == cmp.compare(key, key())) {
370          fillJoinCollector(createKey());
371          jc.add(id, getDelegate());
372          return;
373        }
374        jc.add(id, EMPTY);
375      }
376    
377      /**
378       * For all child RRs offering the key provided, obtain an iterator
379       * at that position in the JoinCollector.
380       */
381      protected void fillJoinCollector(K iterkey) throws IOException {
382        if (!q.isEmpty()) {
383          q.peek().key(iterkey);
384          while (0 == cmp.compare(q.peek().key(), iterkey)) {
385            ComposableRecordReader<K,?> t = q.poll();
386            t.accept(jc, iterkey);
387            if (t.hasNext()) {
388              q.add(t);
389            } else if (q.isEmpty()) {
390              return;
391            }
392          }
393        }
394      }
395    
396      /**
397       * Implement Comparable contract (compare key of join or head of heap
398       * with that of another).
399       */
400      public int compareTo(ComposableRecordReader<K,?> other) {
401        return cmp.compare(key(), other.key());
402      }
403    
404      /**
405       * Create a new key value common to all child RRs.
406       * @throws ClassCastException if key classes differ.
407       */
408      @SuppressWarnings("unchecked") // Explicit check for key class agreement
409      public K createKey() {
410        if (null == keyclass) {
411          final Class<?> cls = kids[0].createKey().getClass();
412          for (RecordReader<K,? extends Writable> rr : kids) {
413            if (!cls.equals(rr.createKey().getClass())) {
414              throw new ClassCastException("Child key classes fail to agree");
415            }
416          }
417          keyclass = cls.asSubclass(WritableComparable.class);
418        }
419        return (K) ReflectionUtils.newInstance(keyclass, getConf());
420      }
421    
422      /**
423       * Create a value to be used internally for joins.
424       */
425      protected TupleWritable createInternalValue() {
426        Writable[] vals = new Writable[kids.length];
427        for (int i = 0; i < vals.length; ++i) {
428          vals[i] = kids[i].createValue();
429        }
430        return new TupleWritable(vals);
431      }
432    
433      /**
434       * Unsupported (returns zero in all cases).
435       */
436      public long getPos() throws IOException {
437        return 0;
438      }
439    
440      /**
441       * Close all child RRs.
442       */
443      public void close() throws IOException {
444        if (kids != null) {
445          for (RecordReader<K,? extends Writable> rr : kids) {
446            rr.close();
447          }
448        }
449        if (jc != null) {
450          jc.close();
451        }
452      }
453    
454      /**
455       * Report progress as the minimum of all child RR progress.
456       */
457      public float getProgress() throws IOException {
458        float ret = 1.0f;
459        for (RecordReader<K,? extends Writable> rr : kids) {
460          ret = Math.min(ret, rr.getProgress());
461        }
462        return ret;
463      }
464    }