001/** 002 * 003 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) 004 * All rights reserved. 005 * Redistribution and use in source and binary forms, with or 006 * without modification, are permitted provided that the following 007 * conditions are met: 008 * - Redistributions of source code must retain the above copyright 009 * notice, this list of conditions and the following disclaimer. 010 * - Redistributions in binary form must reproduce the above copyright 011 * notice, this list of conditions and the following disclaimer in 012 * the documentation and/or other materials provided with the distribution. 013 * - Neither the name of the University Catholique de Louvain - UCL 014 * nor the names of its contributors may be used to endorse or 015 * promote products derived from this software without specific prior 016 * written permission. 017 * 018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 021 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 022 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 024 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 026 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 027 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 028 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 029 * POSSIBILITY OF SUCH DAMAGE. 030 */ 031 032/** 033 * Licensed to the Apache Software Foundation (ASF) under one 034 * or more contributor license agreements. See the NOTICE file 035 * distributed with this work for additional information 036 * regarding copyright ownership. The ASF licenses this file 037 * to you under the Apache License, Version 2.0 (the 038 * "License"); you may not use this file except in compliance 039 * with the License. You may obtain a copy of the License at 040 * 041 * http://www.apache.org/licenses/LICENSE-2.0 042 * 043 * Unless required by applicable law or agreed to in writing, software 044 * distributed under the License is distributed on an "AS IS" BASIS, 045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 046 * See the License for the specific language governing permissions and 047 * limitations under the License. 048 */ 049package org.apache.hadoop.util.bloom; 050 051import java.io.DataInput; 052import java.io.DataOutput; 053import java.io.IOException; 054import java.util.ArrayList; 055import java.util.Collection; 056import java.util.Collections; 057import java.util.List; 058import java.util.Random; 059 060import org.apache.hadoop.classification.InterfaceAudience; 061import org.apache.hadoop.classification.InterfaceStability; 062 063/** 064 * Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper. 065 * <p> 066 * It allows the removal of selected false positives at the cost of introducing 067 * random false negatives, and with the benefit of eliminating some random false 068 * positives at the same time. 069 * 070 * <p> 071 * Originally created by 072 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>. 073 * 074 * @see Filter The general behavior of a filter 075 * @see BloomFilter A Bloom filter 076 * @see RemoveScheme The different selective clearing algorithms 077 * 078 * @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a> 079 */ 080@InterfaceAudience.Public 081@InterfaceStability.Stable 082public final class RetouchedBloomFilter extends BloomFilter 083implements RemoveScheme { 084 /** 085 * KeyList vector (or ElementList Vector, as defined in the paper) of false positives. 086 */ 087 List<Key>[] fpVector; 088 089 /** 090 * KeyList vector of keys recorded in the filter. 091 */ 092 List<Key>[] keyVector; 093 094 /** 095 * Ratio vector. 096 */ 097 double[] ratio; 098 099 private Random rand; 100 101 /** Default constructor - use with readFields */ 102 public RetouchedBloomFilter() {} 103 104 /** 105 * Constructor 106 * @param vectorSize The vector size of <i>this</i> filter. 107 * @param nbHash The number of hash function to consider. 108 * @param hashType type of the hashing function (see 109 * {@link org.apache.hadoop.util.hash.Hash}). 110 */ 111 public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) { 112 super(vectorSize, nbHash, hashType); 113 114 this.rand = null; 115 createVector(); 116 } 117 118 @Override 119 public void add(Key key) { 120 if (key == null) { 121 throw new NullPointerException("key can not be null"); 122 } 123 124 int[] h = hash.hash(key); 125 hash.clear(); 126 127 for (int i = 0; i < nbHash; i++) { 128 bits.set(h[i]); 129 keyVector[h[i]].add(key); 130 } 131 } 132 133 /** 134 * Adds a false positive information to <i>this</i> retouched Bloom filter. 135 * <p> 136 * <b>Invariant</b>: if the false positive is <code>null</code>, nothing happens. 137 * @param key The false positive key to add. 138 */ 139 public void addFalsePositive(Key key) { 140 if (key == null) { 141 throw new NullPointerException("key can not be null"); 142 } 143 144 int[] h = hash.hash(key); 145 hash.clear(); 146 147 for (int i = 0; i < nbHash; i++) { 148 fpVector[h[i]].add(key); 149 } 150 } 151 152 /** 153 * Adds a collection of false positive information to <i>this</i> retouched Bloom filter. 154 * @param coll The collection of false positive. 155 */ 156 public void addFalsePositive(Collection<Key> coll) { 157 if (coll == null) { 158 throw new NullPointerException("Collection<Key> can not be null"); 159 } 160 161 for (Key k : coll) { 162 addFalsePositive(k); 163 } 164 } 165 166 /** 167 * Adds a list of false positive information to <i>this</i> retouched Bloom filter. 168 * @param keys The list of false positive. 169 */ 170 public void addFalsePositive(List<Key> keys) { 171 if (keys == null) { 172 throw new NullPointerException("ArrayList<Key> can not be null"); 173 } 174 175 for (Key k : keys) { 176 addFalsePositive(k); 177 } 178 } 179 180 /** 181 * Adds an array of false positive information to <i>this</i> retouched Bloom filter. 182 * @param keys The array of false positive. 183 */ 184 public void addFalsePositive(Key[] keys) { 185 if (keys == null) { 186 throw new NullPointerException("Key[] can not be null"); 187 } 188 189 for (int i = 0; i < keys.length; i++) { 190 addFalsePositive(keys[i]); 191 } 192 } 193 194 /** 195 * Performs the selective clearing for a given key. 196 * @param k The false positive key to remove from <i>this</i> retouched Bloom filter. 197 * @param scheme The selective clearing scheme to apply. 198 */ 199 public void selectiveClearing(Key k, short scheme) { 200 if (k == null) { 201 throw new NullPointerException("Key can not be null"); 202 } 203 204 if (!membershipTest(k)) { 205 throw new IllegalArgumentException("Key is not a member"); 206 } 207 208 int index = 0; 209 int[] h = hash.hash(k); 210 211 switch(scheme) { 212 213 case RANDOM: 214 index = randomRemove(); 215 break; 216 217 case MINIMUM_FN: 218 index = minimumFnRemove(h); 219 break; 220 221 case MAXIMUM_FP: 222 index = maximumFpRemove(h); 223 break; 224 225 case RATIO: 226 index = ratioRemove(h); 227 break; 228 229 default: 230 throw new AssertionError("Undefined selective clearing scheme"); 231 232 } 233 234 clearBit(index); 235 } 236 237 private int randomRemove() { 238 if (rand == null) { 239 rand = new Random(); 240 } 241 242 return rand.nextInt(nbHash); 243 } 244 245 /** 246 * Chooses the bit position that minimizes the number of false negative generated. 247 * @param h The different bit positions. 248 * @return The position that minimizes the number of false negative generated. 249 */ 250 private int minimumFnRemove(int[] h) { 251 int minIndex = Integer.MAX_VALUE; 252 double minValue = Double.MAX_VALUE; 253 254 for (int i = 0; i < nbHash; i++) { 255 double keyWeight = getWeight(keyVector[h[i]]); 256 257 if (keyWeight < minValue) { 258 minIndex = h[i]; 259 minValue = keyWeight; 260 } 261 262 } 263 264 return minIndex; 265 } 266 267 /** 268 * Chooses the bit position that maximizes the number of false positive removed. 269 * @param h The different bit positions. 270 * @return The position that maximizes the number of false positive removed. 271 */ 272 private int maximumFpRemove(int[] h) { 273 int maxIndex = Integer.MIN_VALUE; 274 double maxValue = Double.MIN_VALUE; 275 276 for (int i = 0; i < nbHash; i++) { 277 double fpWeight = getWeight(fpVector[h[i]]); 278 279 if (fpWeight > maxValue) { 280 maxValue = fpWeight; 281 maxIndex = h[i]; 282 } 283 } 284 285 return maxIndex; 286 } 287 288 /** 289 * Chooses the bit position that minimizes the number of false negative generated while maximizing. 290 * the number of false positive removed. 291 * @param h The different bit positions. 292 * @return The position that minimizes the number of false negative generated while maximizing. 293 */ 294 private int ratioRemove(int[] h) { 295 computeRatio(); 296 int minIndex = Integer.MAX_VALUE; 297 double minValue = Double.MAX_VALUE; 298 299 for (int i = 0; i < nbHash; i++) { 300 if (ratio[h[i]] < minValue) { 301 minValue = ratio[h[i]]; 302 minIndex = h[i]; 303 } 304 } 305 306 return minIndex; 307 } 308 309 /** 310 * Clears a specified bit in the bit vector and keeps up-to-date the KeyList vectors. 311 * @param index The position of the bit to clear. 312 */ 313 private void clearBit(int index) { 314 if (index < 0 || index >= vectorSize) { 315 throw new ArrayIndexOutOfBoundsException(index); 316 } 317 318 List<Key> kl = keyVector[index]; 319 List<Key> fpl = fpVector[index]; 320 321 // update key list 322 int listSize = kl.size(); 323 for (int i = 0; i < listSize && !kl.isEmpty(); i++) { 324 removeKey(kl.get(0), keyVector); 325 } 326 327 kl.clear(); 328 keyVector[index].clear(); 329 330 //update false positive list 331 listSize = fpl.size(); 332 for (int i = 0; i < listSize && !fpl.isEmpty(); i++) { 333 removeKey(fpl.get(0), fpVector); 334 } 335 336 fpl.clear(); 337 fpVector[index].clear(); 338 339 //update ratio 340 ratio[index] = 0.0; 341 342 //update bit vector 343 bits.clear(index); 344 } 345 346 /** 347 * Removes a given key from <i>this</i> filer. 348 * @param k The key to remove. 349 * @param vector The counting vector associated to the key. 350 */ 351 private void removeKey(Key k, List<Key>[] vector) { 352 if (k == null) { 353 throw new NullPointerException("Key can not be null"); 354 } 355 if (vector == null) { 356 throw new NullPointerException("ArrayList<Key>[] can not be null"); 357 } 358 359 int[] h = hash.hash(k); 360 hash.clear(); 361 362 for (int i = 0; i < nbHash; i++) { 363 vector[h[i]].remove(k); 364 } 365 } 366 367 /** 368 * Computes the ratio A/FP. 369 */ 370 private void computeRatio() { 371 for (int i = 0; i < vectorSize; i++) { 372 double keyWeight = getWeight(keyVector[i]); 373 double fpWeight = getWeight(fpVector[i]); 374 375 if (keyWeight > 0 && fpWeight > 0) { 376 ratio[i] = keyWeight / fpWeight; 377 } 378 } 379 } 380 381 private double getWeight(List<Key> keyList) { 382 double weight = 0.0; 383 for (Key k : keyList) { 384 weight += k.getWeight(); 385 } 386 return weight; 387 } 388 389 /** 390 * Creates and initialises the various vectors. 391 */ 392 @SuppressWarnings("unchecked") 393 private void createVector() { 394 fpVector = new List[vectorSize]; 395 keyVector = new List[vectorSize]; 396 ratio = new double[vectorSize]; 397 398 for (int i = 0; i < vectorSize; i++) { 399 fpVector[i] = Collections.synchronizedList(new ArrayList<Key>()); 400 keyVector[i] = Collections.synchronizedList(new ArrayList<Key>()); 401 ratio[i] = 0.0; 402 } 403 } 404 405 // Writable 406 407 @Override 408 public void write(DataOutput out) throws IOException { 409 super.write(out); 410 for (int i = 0; i < fpVector.length; i++) { 411 List<Key> list = fpVector[i]; 412 out.writeInt(list.size()); 413 for (Key k : list) { 414 k.write(out); 415 } 416 } 417 for (int i = 0; i < keyVector.length; i++) { 418 List<Key> list = keyVector[i]; 419 out.writeInt(list.size()); 420 for (Key k : list) { 421 k.write(out); 422 } 423 } 424 for (int i = 0; i < ratio.length; i++) { 425 out.writeDouble(ratio[i]); 426 } 427 } 428 429 @Override 430 public void readFields(DataInput in) throws IOException { 431 super.readFields(in); 432 createVector(); 433 for (int i = 0; i < fpVector.length; i++) { 434 List<Key> list = fpVector[i]; 435 int size = in.readInt(); 436 for (int j = 0; j < size; j++) { 437 Key k = new Key(); 438 k.readFields(in); 439 list.add(k); 440 } 441 } 442 for (int i = 0; i < keyVector.length; i++) { 443 List<Key> list = keyVector[i]; 444 int size = in.readInt(); 445 for (int j = 0; j < size; j++) { 446 Key k = new Key(); 447 k.readFields(in); 448 list.add(k); 449 } 450 } 451 for (int i = 0; i < ratio.length; i++) { 452 ratio[i] = in.readDouble(); 453 } 454 } 455}