/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or
 * without modification, are permitted provided that the following
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or
 *    promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.util.bloom;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * Implements a <i>counting Bloom filter</i>, as defined by Fan et al. in a ToN
 * 2000 paper.
 * <p>
 * A counting Bloom filter is an improvement on a standard Bloom filter as it
 * allows dynamic additions and deletions of set membership information. This
 * is achieved through the use of a counting vector instead of a bit vector.
 * <p>
 * Originally created by
 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
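 * <p>
 * A minimal usage sketch; the vector size, hash count and hash type below are
 * arbitrary example values (see {@link org.apache.hadoop.util.hash.Hash} for
 * the available hash types):
 * <pre>
 * // 1000 4-bit buckets, 4 hash functions, Murmur hashing
 * CountingBloomFilter cbf = new CountingBloomFilter(1000, 4, Hash.MURMUR_HASH);
 * Key key = new Key("myKey".getBytes());
 * cbf.add(key);
 * cbf.add(key);
 * cbf.membershipTest(key);   // true
 * cbf.approximateCount(key); // 2, or more if other keys collide
 * cbf.delete(key);
 * cbf.delete(key);
 * cbf.membershipTest(key);   // false, barring collisions with other keys
 * </pre>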
 *
 * @see Filter The general behavior of a filter
 *
 * @see <a href="http://portal.acm.org/citation.cfm?id=343571.343572">Summary cache: a scalable wide-area web cache sharing protocol</a>
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public final class CountingBloomFilter extends Filter {
  /** Storage for the counting buckets */
  private long[] buckets;

  /** We are using 4-bit buckets, so each bucket can count to 15 */
  private final static long BUCKET_MAX_VALUE = 15;

  /** Default constructor - use with readFields */
  public CountingBloomFilter() {}

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see
   * {@link org.apache.hadoop.util.hash.Hash}).
   */
  public CountingBloomFilter(int vectorSize, int nbHash, int hashType) {
    super(vectorSize, nbHash, hashType);
    buckets = new long[buckets2words(vectorSize)];
  }

  /** Returns the number of 64-bit words it takes to hold vectorSize 4-bit buckets
   *  (ceiling division by 16). */
  private static int buckets2words(int vectorSize) {
    return ((vectorSize - 1) >>> 4) + 1;
  }

  @Override
  public void add(Key key) {
    if(key == null) {
      throw new NullPointerException("key cannot be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;               // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      // only increment if the count in the bucket is less than BUCKET_MAX_VALUE
      if(bucketValue < BUCKET_MAX_VALUE) {
        // increment by 1
        buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue + 1) << bucketShift);
      }
    }
  }

  /**
   * Removes a specified key from <i>this</i> counting Bloom filter.
   * <p>
   * <b>Invariant</b>: deleting a key that does not belong to <i>this</i>
   * counting Bloom filter raises an {@link IllegalArgumentException}.
   * @param key The key to remove.
   * @throws IllegalArgumentException if the key is not a member of
   *           <i>this</i> counting Bloom filter.
   */
  public void delete(Key key) {
    if(key == null) {
      throw new NullPointerException("Key may not be null");
    }
    if(!membershipTest(key)) {
      throw new IllegalArgumentException("Key is not a member");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;               // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      // only decrement if the count in the bucket is between 1 and
      // BUCKET_MAX_VALUE - 1; a saturated bucket is left untouched because
      // its true count is no longer known
      if(bucketValue >= 1 && bucketValue < BUCKET_MAX_VALUE) {
        // decrement by 1
        buckets[wordNum] = (buckets[wordNum] & ~bucketMask) | ((bucketValue - 1) << bucketShift);
      }
    }
  }

  @Override
  public void and(Filter filter) {
    if(filter == null
        || !(filter instanceof CountingBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be and-ed");
    }
    CountingBloomFilter cbf = (CountingBloomFilter)filter;

    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      this.buckets[i] &= cbf.buckets[i];
    }
  }

  @Override
  public boolean membershipTest(Key key) {
    if(key == null) {
      throw new NullPointerException("Key may not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for(int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;               // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;

      if((buckets[wordNum] & bucketMask) == 0) {
        return false;
      }
    }

    return true;
  }

  /**
   * This method calculates an approximate count of the key, i.e. how many
   * times the key was added to the filter. This allows the filter to be
   * used as an approximate <code>key -&gt; count</code> map.
   * <p>NOTE: due to the bucket size of this filter, inserting the same
   * key more than 15 times will cause an overflow at all filter positions
   * associated with this key, and it will significantly increase the error
   * rate for this and other keys. For this reason the filter can only be
   * used to store small count values <code>0 &lt;= N &lt;&lt; 15</code>.
   * @param key key to be tested
   * @return 0 if the key is not present. Otherwise, a positive value v will
   * be returned such that <code>v == count</code> with probability equal to
   * one minus the error rate of this filter, and <code>v &gt; count</code>
   * otherwise. Additionally, if the filter experienced an underflow as a
   * result of a {@link #delete(Key)} operation, the return value may be lower
   * than the <code>count</code> with the probability of the false negative
   * rate of such filter.
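   * <p>
   * A small illustrative sketch, assuming a filter <code>cbf</code> and a key
   * <code>key</code> built as in the class-level example (the exact value
   * returned depends on collisions with other keys):
   * <pre>
   * cbf.add(key);
   * cbf.add(key);
   * cbf.add(key);
   * int n = cbf.approximateCount(key); // n is 3 unless some other key
   *                                    // collides in every one of this
   *                                    // key's buckets, in which case n
   *                                    // may be larger
   * </pre>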
   */
  public int approximateCount(Key key) {
    int res = Integer.MAX_VALUE;
    int[] h = hash.hash(key);
    hash.clear();
    for (int i = 0; i < nbHash; i++) {
      // find the bucket
      int wordNum = h[i] >> 4;               // div 16
      int bucketShift = (h[i] & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;
      if (bucketValue < res) res = (int)bucketValue;
    }
    if (res != Integer.MAX_VALUE) {
      return res;
    } else {
      return 0;
    }
  }

  @Override
  public void not() {
    throw new UnsupportedOperationException("not() is undefined for "
        + this.getClass().getName());
  }

  @Override
  public void or(Filter filter) {
    if(filter == null
        || !(filter instanceof CountingBloomFilter)
        || filter.vectorSize != this.vectorSize
        || filter.nbHash != this.nbHash) {
      throw new IllegalArgumentException("filters cannot be or-ed");
    }

    CountingBloomFilter cbf = (CountingBloomFilter)filter;

    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      this.buckets[i] |= cbf.buckets[i];
    }
  }

  @Override
  public void xor(Filter filter) {
    throw new UnsupportedOperationException("xor() is undefined for "
        + this.getClass().getName());
  }

  @Override
  public String toString() {
    StringBuilder res = new StringBuilder();

    for(int i = 0; i < vectorSize; i++) {
      if(i > 0) {
        res.append(" ");
      }

      int wordNum = i >> 4;               // div 16
      int bucketShift = (i & 0x0f) << 2;  // (mod 16) * 4

      long bucketMask = 15L << bucketShift;
      long bucketValue = (buckets[wordNum] & bucketMask) >>> bucketShift;

      res.append(bucketValue);
    }

    return res.toString();
  }

  // Writable

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    int sizeInWords = buckets2words(vectorSize);
    for(int i = 0; i < sizeInWords; i++) {
      out.writeLong(buckets[i]);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    int sizeInWords = buckets2words(vectorSize);
    buckets = new long[sizeInWords];
    for(int i = 0; i < sizeInWords; i++) {
      buckets[i] = in.readLong();
    }
  }
}