/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.mapreduce.lib.aggregate;

import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import java.util.TreeMap;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * This class implements a value aggregator that dedupes a sequence of objects.
 *
 * Each value is recorded by its {@link Object#toString()} form, so two objects
 * with the same string representation count as one. The distinct strings are
 * held in a {@link TreeMap} and are therefore reported in ascending string
 * order. An optional upper bound limits how many distinct values are kept.
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public class UniqValueCount implements ValueAggregator<Object> {
  /**
   * Name of the configuration property for the unique-value limit. This class
   * does not read the property itself; presumably callers look it up and pass
   * the result to {@link #setMaxItems(long)} or the constructor.
   */
  public static final String MAX_NUM_UNIQUE_VALUES =
    "mapreduce.aggregate.max.num.unique.values";

  /** Distinct values seen so far, keyed by string form; map values are a dummy marker. */
  private TreeMap<Object, Object> uniqItems = null;

  /** Cached size of {@link #uniqItems}; kept in step by addNextValue and reset. */
  private long numItems = 0;

  /** Upper bound on the number of distinct values that will be stored. */
  private long maxNumItems = Long.MAX_VALUE;

  /**
   * the default constructor; the number of unique values is effectively
   * unbounded ({@code Long.MAX_VALUE}).
   */
  public UniqValueCount() {
    this(Long.MAX_VALUE);
  }

  /**
   * constructor
   * @param maxNum the limit in the number of unique values to keep. A value
   *               that is zero or negative is ignored and the limit falls
   *               back to {@code Long.MAX_VALUE}.
   */
  public UniqValueCount(long maxNum) {
    this.uniqItems = new TreeMap<Object, Object>();
    this.numItems = 0;
    this.maxNumItems = (maxNum > 0) ? maxNum : Long.MAX_VALUE;
  }

  /**
   * Set the limit on the number of unique values.
   *
   * The limit is never moved below the number of values already held: if
   * {@code n} is smaller than the current count, the limit is clamped to the
   * current count instead.
   *
   * @param n the desired limit on the number of unique values
   * @return the new limit on the number of unique values
   */
  public long setMaxItems(long n) {
    if (n >= numItems) {
      this.maxNumItems = n;
    } else if (this.maxNumItems >= this.numItems) {
      this.maxNumItems = this.numItems;
    }
    return this.maxNumItems;
  }

  /**
   * add a value to the aggregator. Once the configured limit of unique values
   * has been reached, further values are silently dropped.
   *
   * @param val
   *          an object; it is recorded via its {@code toString()} form, so it
   *          must not be {@code null}.
   */
  public void addNextValue(Object val) {
    // Strict '<': with '<=' one value beyond the limit would be stored.
    if (this.numItems < this.maxNumItems) {
      uniqItems.put(val.toString(), "1");
      this.numItems = this.uniqItems.size();
    }
  }

  /**
   * @return return the number of unique objects aggregated, as a decimal
   *         string
   */
  public String getReport() {
    return "" + uniqItems.size();
  }

  /**
   *
   * @return the set of the unique objects. This is the key-set view of the
   *         current backing map, not a copy.
   */
  public Set<Object> getUniqueItems() {
    return uniqItems.keySet();
  }

  /**
   * reset the aggregator: drops all recorded values. The configured limit is
   * left unchanged.
   */
  public void reset() {
    uniqItems = new TreeMap<Object, Object>();
    // Keep the cached count in step with the fresh map; a stale count would
    // make addNextValue keep rejecting values after the limit had been hit.
    this.numItems = 0;
  }

  /**
   * @return return an array of the unique objects, in ascending order of
   *         their string form. The return value is expected to be used by
   *         a combiner.
   */
  public ArrayList<Object> getCombinerOutput() {
    return new ArrayList<Object>(uniqItems.keySet());
  }
}