001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.mapreduce.lib.aggregate;
020
021import java.util.ArrayList;
022import java.util.Iterator;
023import java.util.Set;
024import java.util.TreeMap;
025
026import org.apache.hadoop.classification.InterfaceAudience;
027import org.apache.hadoop.classification.InterfaceStability;
028
029/**
030 * This class implements a value aggregator that dedupes a sequence of objects.
031 * 
032 */
033@InterfaceAudience.Public
034@InterfaceStability.Stable
035public class UniqValueCount implements ValueAggregator<Object> {
036  public static final String MAX_NUM_UNIQUE_VALUES = 
037    "mapreduce.aggregate.max.num.unique.values";
038
039  private TreeMap<Object, Object> uniqItems = null;
040
041  private long numItems = 0;
042  
043  private long maxNumItems = Long.MAX_VALUE;
044
045  /**
046   * the default constructor
047   * 
048   */
049  public UniqValueCount() {
050    this(Long.MAX_VALUE);
051  }
052  
053  /**
054   * constructor
055   * @param maxNum the limit in the number of unique values to keep.
056   *  
057   */
058  public UniqValueCount(long maxNum) {
059    uniqItems = new TreeMap<Object, Object>();
060    this.numItems = 0;
061    maxNumItems = Long.MAX_VALUE;
062    if (maxNum > 0 ) {
063      this.maxNumItems = maxNum;
064    }
065  }
066
067  /**
068   * Set the limit on the number of unique values
069   * @param n the desired limit on the number of unique values
070   * @return the new limit on the number of unique values
071   */
072  public long setMaxItems(long n) {
073    if (n >= numItems) {
074      this.maxNumItems = n;
075    } else if (this.maxNumItems >= this.numItems) {
076      this.maxNumItems = this.numItems;
077    }
078    return this.maxNumItems;
079  }
080  
081  /**
082   * add a value to the aggregator
083   * 
084   * @param val
085   *          an object.
086   * 
087   */
088  public void addNextValue(Object val) {
089    if (this.numItems <= this.maxNumItems) {
090      uniqItems.put(val.toString(), "1");
091      this.numItems = this.uniqItems.size();
092    }
093  }
094
095  /**
096   * @return return the number of unique objects aggregated
097   */
098  public String getReport() {
099    return "" + uniqItems.size();
100  }
101
102  /**
103   * 
104   * @return the set of the unique objects
105   */
106  public Set<Object> getUniqueItems() {
107    return uniqItems.keySet();
108  }
109
110  /**
111   * reset the aggregator
112   */
113  public void reset() {
114    uniqItems = new TreeMap<Object, Object>();
115  }
116
117  /**
118   * @return return an array of the unique objects. The return value is
119   *         expected to be used by the a combiner.
120   */
121  public ArrayList<Object> getCombinerOutput() {
122    Object key = null;
123    Iterator<Object> iter = uniqItems.keySet().iterator();
124    ArrayList<Object> retv = new ArrayList<Object>();
125
126    while (iter.hasNext()) {
127      key = iter.next();
128      retv.add(key);
129    }
130    return retv;
131  }
132}