/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or 
 * without modification, are permitted provided that the following 
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright 
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright 
 *    notice, this list of conditions and the following disclaimer in 
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or 
 *    promote products derived from this software without specific prior 
 *    written permission.
 *    
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.util.bloom;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
 * <p>
 * It allows the removal of selected false positives at the cost of introducing
 * random false negatives, and with the benefit of eliminating some random false
 * positives at the same time.
 * 
 * <p>
 * Originally created by
 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
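 *
 * <p>
 * A minimal usage sketch; the vector size, hash count, and hash type below are
 * illustrative assumptions rather than recommended values:
 * <pre>
 * RetouchedBloomFilter filter =
 *     new RetouchedBloomFilter(1000, 4, Hash.MURMUR_HASH);
 * Key member = new Key("member".getBytes());
 * filter.add(member);
 *
 * Key suspect = new Key("never-added".getBytes());
 * if (filter.membershipTest(suspect)) {
 *   // The key was never added, so this is a false positive: record it and
 *   // clear one of its bits, possibly turning genuine keys into false negatives.
 *   filter.addFalsePositive(suspect);
 *   filter.selectiveClearing(suspect, RemoveScheme.MAXIMUM_FP);
 * }
 * </pre>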
 * 
 * @see Filter The general behavior of a filter
 * @see BloomFilter A Bloom filter
 * @see RemoveScheme The different selective clearing algorithms
 * 
 * @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a>
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public final class RetouchedBloomFilter extends BloomFilter
implements RemoveScheme {
  /**
   * KeyList vector (or ElementList vector, as defined in the paper) of false positives.
   */
  List<Key>[] fpVector;

  /**
   * KeyList vector of keys recorded in the filter.
   */
  List<Key>[] keyVector;

  /**
   * Ratio vector.
   */
  double[] ratio;

  private Random rand;

  /** Default constructor - use with readFields */
  public RetouchedBloomFilter() {}

  /**
   * Constructor.
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType Type of the hashing function (see
   * {@link org.apache.hadoop.util.hash.Hash}).
   */
  public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) {
    super(vectorSize, nbHash, hashType);

    this.rand = null;
    createVector();
  }

  @Override
  public void add(Key key) {
    if (key == null) {
      throw new NullPointerException("key can not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

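    // Set each of the key's bit positions and record the key against every one
    // of them, so that selective clearing can later account for it.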
    for (int i = 0; i < nbHash; i++) {
      bits.set(h[i]);
      keyVector[h[i]].add(key);
    }
  }

  /**
   * Adds false positive information to <i>this</i> retouched Bloom filter.
   * <p>
   * The key must not be <code>null</code>; a <code>NullPointerException</code>
   * is thrown otherwise.
   * @param key The false positive key to add.
   */
  public void addFalsePositive(Key key) {
    if (key == null) {
      throw new NullPointerException("key can not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for (int i = 0; i < nbHash; i++) {
      fpVector[h[i]].add(key);
    }
  }

  /**
   * Adds a collection of false positive information to <i>this</i> retouched Bloom filter.
   * @param coll The collection of false positives.
   */
  public void addFalsePositive(Collection<Key> coll) {
    if (coll == null) {
      throw new NullPointerException("Collection<Key> can not be null");
    }

    for (Key k : coll) {
      addFalsePositive(k);
    }
  }

  /**
   * Adds a list of false positive information to <i>this</i> retouched Bloom filter.
   * @param keys The list of false positives.
   */
  public void addFalsePositive(List<Key> keys) {
    if (keys == null) {
      throw new NullPointerException("List<Key> can not be null");
    }

    for (Key k : keys) {
      addFalsePositive(k);
    }
  }

  /**
   * Adds an array of false positive information to <i>this</i> retouched Bloom filter.
   * @param keys The array of false positives.
   */
  public void addFalsePositive(Key[] keys) {
    if (keys == null) {
      throw new NullPointerException("Key[] can not be null");
    }

    for (int i = 0; i < keys.length; i++) {
      addFalsePositive(keys[i]);
    }
  }

  /**
   * Performs the selective clearing for a given key.
   * @param k The false positive key to remove from <i>this</i> retouched Bloom filter.
   * @param scheme The selective clearing scheme to apply.
   */
  public void selectiveClearing(Key k, short scheme) {
    if (k == null) {
      throw new NullPointerException("Key can not be null");
    }

    if (!membershipTest(k)) {
      throw new IllegalArgumentException("Key is not a member");
    }

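    // All of the key's nbHash positions are currently set; choose one to clear
    // according to the requested scheme, trading new false negatives for the
    // removal of this (and possibly other) false positives.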
    int index = 0;
    int[] h = hash.hash(k);

    switch(scheme) {

    case RANDOM:
      // randomRemove() returns an offset into h (0..nbHash-1), not a bit
      // position, so map it through h before clearing.
      index = h[randomRemove()];
      break;

    case MINIMUM_FN:
      index = minimumFnRemove(h);
      break;

    case MAXIMUM_FP:
      index = maximumFpRemove(h);
      break;

    case RATIO:
      index = ratioRemove(h);
      break;

    default:
      throw new AssertionError("Undefined selective clearing scheme");

    }

    clearBit(index);
  }

  private int randomRemove() {
    if (rand == null) {
      rand = new Random();
    }

    return rand.nextInt(nbHash);
  }

  /**
   * Chooses the bit position that minimizes the number of false negatives generated.
   * @param h The different bit positions.
   * @return The position that minimizes the number of false negatives generated.
   */
  private int minimumFnRemove(int[] h) {
    int minIndex = Integer.MAX_VALUE;
    double minValue = Double.MAX_VALUE;

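    // The weight of keyVector[h[i]] approximates the cost of clearing bit h[i]:
    // every key recorded at that position would become a false negative.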
    for (int i = 0; i < nbHash; i++) {
      double keyWeight = getWeight(keyVector[h[i]]);

      if (keyWeight < minValue) {
        minIndex = h[i];
        minValue = keyWeight;
      }

    }

    return minIndex;
  }

  /**
   * Chooses the bit position that maximizes the number of false positives removed.
   * @param h The different bit positions.
   * @return The position that maximizes the number of false positives removed.
   */
  private int maximumFpRemove(int[] h) {
    int maxIndex = Integer.MIN_VALUE;
    // Double.NEGATIVE_INFINITY (rather than Double.MIN_VALUE, which is the
    // smallest positive double) ensures the first candidate position is
    // accepted even when no false positives have been recorded.
    double maxValue = Double.NEGATIVE_INFINITY;

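    // The weight of fpVector[h[i]] approximates the benefit of clearing bit
    // h[i]: every false positive recorded at that position stops matching.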
    for (int i = 0; i < nbHash; i++) {
      double fpWeight = getWeight(fpVector[h[i]]);

      if (fpWeight > maxValue) {
        maxValue = fpWeight;
        maxIndex = h[i];
      }
    }

    return maxIndex;
  }

  /**
   * Chooses the bit position that minimizes the number of false negatives generated
   * while maximizing the number of false positives removed.
   * @param h The different bit positions.
   * @return The position that minimizes the number of false negatives generated
   * while maximizing the number of false positives removed.
   */
  private int ratioRemove(int[] h) {
    computeRatio();
    int minIndex = Integer.MAX_VALUE;
    double minValue = Double.MAX_VALUE;

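    // ratio[i] is keyWeight / fpWeight, i.e. the false-negative cost paid per
    // unit of false-positive weight removed, so the smallest ratio is cheapest.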
    for (int i = 0; i < nbHash; i++) {
      if (ratio[h[i]] < minValue) {
        minValue = ratio[h[i]];
        minIndex = h[i];
      }
    }

    return minIndex;
  }

  /**
   * Clears a specified bit in the bit vector and keeps the KeyList vectors up to date.
   * @param index The position of the bit to clear.
   */
  private void clearBit(int index) {
    if (index < 0 || index >= vectorSize) {
      throw new ArrayIndexOutOfBoundsException(index);
    }

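    // Every key and false positive recorded at this position stops matching once
    // the bit is cleared, so remove each of them from all of their hash positions
    // to keep the bookkeeping vectors consistent.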
    List<Key> kl = keyVector[index];
    List<Key> fpl = fpVector[index];

    // update key list
    int listSize = kl.size();
    for (int i = 0; i < listSize && !kl.isEmpty(); i++) {
      removeKey(kl.get(0), keyVector);
    }

    kl.clear();
    keyVector[index].clear();

    // update false positive list
    listSize = fpl.size();
    for (int i = 0; i < listSize && !fpl.isEmpty(); i++) {
      removeKey(fpl.get(0), fpVector);
    }

    fpl.clear();
    fpVector[index].clear();

    // update ratio
    ratio[index] = 0.0;

    // update bit vector
    bits.clear(index);
  }

  /**
   * Removes a given key from <i>this</i> filter.
   * @param k The key to remove.
   * @param vector The KeyList vector associated with the key.
   */
  private void removeKey(Key k, List<Key>[] vector) {
    if (k == null) {
      throw new NullPointerException("Key can not be null");
    }
    if (vector == null) {
      throw new NullPointerException("List<Key>[] can not be null");
    }

    int[] h = hash.hash(k);
    hash.clear();

    for (int i = 0; i < nbHash; i++) {
      vector[h[i]].remove(k);
    }
  }

  /**
   * Computes, for each bit position, the ratio of recorded key weight to
   * recorded false positive weight.
   */
  private void computeRatio() {
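    // Positions where either weight is zero keep their previous ratio
    // (0.0 right after initialization), which ratioRemove() treats as cheapest.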
    for (int i = 0; i < vectorSize; i++) {
      double keyWeight = getWeight(keyVector[i]);
      double fpWeight = getWeight(fpVector[i]);

      if (keyWeight > 0 && fpWeight > 0) {
        ratio[i] = keyWeight / fpWeight;
      }
    }
  }

  private double getWeight(List<Key> keyList) {
    double weight = 0.0;
    for (Key k : keyList) {
      weight += k.getWeight();
    }
    return weight;
  }

  /**
   * Creates and initialises the various vectors.
   */
  @SuppressWarnings("unchecked")
  private void createVector() {
    fpVector = new List[vectorSize];
    keyVector = new List[vectorSize];
    ratio = new double[vectorSize];

    for (int i = 0; i < vectorSize; i++) {
      fpVector[i] = Collections.synchronizedList(new ArrayList<Key>());
      keyVector[i] = Collections.synchronizedList(new ArrayList<Key>());
      ratio[i] = 0.0;
    }
  }

  // Writable

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
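    // Serialized layout: the base BloomFilter fields (written above), then the
    // size and contents of each position's false-positive list, then each
    // position's key list, and finally the ratio array.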
    for (int i = 0; i < fpVector.length; i++) {
      List<Key> list = fpVector[i];
      out.writeInt(list.size());
      for (Key k : list) {
        k.write(out);
      }
    }
    for (int i = 0; i < keyVector.length; i++) {
      List<Key> list = keyVector[i];
      out.writeInt(list.size());
      for (Key k : list) {
        k.write(out);
      }
    }
    for (int i = 0; i < ratio.length; i++) {
      out.writeDouble(ratio[i]);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    createVector();
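    // Read the per-position lists and the ratio array back in exactly the
    // order that write() produced them.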
    for (int i = 0; i < fpVector.length; i++) {
      List<Key> list = fpVector[i];
      int size = in.readInt();
      for (int j = 0; j < size; j++) {
        Key k = new Key();
        k.readFields(in);
        list.add(k);
      }
    }
    for (int i = 0; i < keyVector.length; i++) {
      List<Key> list = keyVector[i];
      int size = in.readInt();
      for (int j = 0; j < size; j++) {
        Key k = new Key();
        k.readFields(in);
        list.add(k);
      }
    }
    for (int i = 0; i < ratio.length; i++) {
      ratio[i] = in.readDouble();
    }
  }
}