/**
 *
 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
 * All rights reserved.
 * Redistribution and use in source and binary forms, with or 
 * without modification, are permitted provided that the following 
 * conditions are met:
 *  - Redistributions of source code must retain the above copyright 
 *    notice, this list of conditions and the following disclaimer.
 *  - Redistributions in binary form must reproduce the above copyright 
 *    notice, this list of conditions and the following disclaimer in 
 *    the documentation and/or other materials provided with the distribution.
 *  - Neither the name of the University Catholique de Louvain - UCL
 *    nor the names of its contributors may be used to endorse or 
 *    promote products derived from this software without specific prior 
 *    written permission.
 *    
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
 * POSSIBILITY OF SUCH DAMAGE.
 */

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.hadoop.util.bloom;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Random;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;

/**
 * Implements a <i>retouched Bloom filter</i>, as defined in the CoNEXT 2006 paper.
 * <p>
 * It allows the removal of selected false positives at the cost of introducing
 * random false negatives, and with the benefit of eliminating some random false
 * positives at the same time.
 * 
 * <p>
 * Originally created by
 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
 * 
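 * <p>
 * A minimal usage sketch (the sizing, hash type and keys below are illustrative
 * assumptions, not recommendations; {@code Hash.MURMUR_HASH} is
 * {@link org.apache.hadoop.util.hash.Hash#MURMUR_HASH}):
 * <pre>{@code
 * RetouchedBloomFilter rbf = new RetouchedBloomFilter(1000, 5, Hash.MURMUR_HASH);
 * rbf.add(new Key("member".getBytes()));      // record a genuine member
 *
 * Key fp = new Key("spurious".getBytes());    // a key known not to belong...
 * if (rbf.membershipTest(fp)) {               // ...yet reported as present
 *   rbf.addFalsePositive(fp);                 // register the false positive
 *   rbf.selectiveClearing(fp, RemoveScheme.MAXIMUM_FP);
 * }
 * }</pre>
 * 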
 * @see Filter The general behavior of a filter
 * @see BloomFilter A Bloom filter
 * @see RemoveScheme The different selective clearing algorithms
 * 
 * @see <a href="http://www-rp.lip6.fr/site_npa/site_rp/_publications/740-rbf_cameraready.pdf">Retouched Bloom Filters: Allowing Networked Applications to Trade Off Selected False Positives Against False Negatives</a>
 */
@InterfaceAudience.Public
@InterfaceStability.Stable
public final class RetouchedBloomFilter extends BloomFilter
implements RemoveScheme {
  /**
   * KeyList vector (or ElementList Vector, as defined in the paper) of false positives.
   */
  List<Key>[] fpVector;

  /**
   * KeyList vector of keys recorded in the filter.
   */
  List<Key>[] keyVector;

  /**
   * Ratio vector (key weight / false positive weight, per bit position).
   */
  double[] ratio;

  private Random rand;

  /** Default constructor - use with readFields */
  public RetouchedBloomFilter() {}

  /**
   * Constructor
   * @param vectorSize The vector size of <i>this</i> filter.
   * @param nbHash The number of hash functions to consider.
   * @param hashType type of the hashing function (see
   * {@link org.apache.hadoop.util.hash.Hash}).
   */
  public RetouchedBloomFilter(int vectorSize, int nbHash, int hashType) {
    super(vectorSize, nbHash, hashType);

    this.rand = null;
    createVector();
  }

  @Override
  public void add(Key key) {
    if (key == null) {
      throw new NullPointerException("key can not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for (int i = 0; i < nbHash; i++) {
      bits.set(h[i]);
      keyVector[h[i]].add(key);
    }
  }

  /**
   * Adds false positive information to <i>this</i> retouched Bloom filter.
   * @param key The false positive key to add.
   * @throws NullPointerException if the key is <code>null</code>.
   */
  public void addFalsePositive(Key key) {
    if (key == null) {
      throw new NullPointerException("key can not be null");
    }

    int[] h = hash.hash(key);
    hash.clear();

    for (int i = 0; i < nbHash; i++) {
      fpVector[h[i]].add(key);
    }
  }

  /**
   * Adds a collection of false positive information to <i>this</i> retouched Bloom filter.
   * @param coll The collection of false positive keys.
   */
  public void addFalsePositive(Collection<Key> coll) {
    if (coll == null) {
      throw new NullPointerException("Collection<Key> can not be null");
    }

    for (Key k : coll) {
      addFalsePositive(k);
    }
  }

  /**
   * Adds a list of false positive information to <i>this</i> retouched Bloom filter.
   * @param keys The list of false positive keys.
   */
  public void addFalsePositive(List<Key> keys) {
    if (keys == null) {
      throw new NullPointerException("List<Key> can not be null");
    }

    for (Key k : keys) {
      addFalsePositive(k);
    }
  }

  /**
   * Adds an array of false positive information to <i>this</i> retouched Bloom filter.
   * @param keys The array of false positive keys.
   */
  public void addFalsePositive(Key[] keys) {
    if (keys == null) {
      throw new NullPointerException("Key[] can not be null");
    }

    for (int i = 0; i < keys.length; i++) {
      addFalsePositive(keys[i]);
    }
  }

  /**
   * Performs the selective clearing for a given key.
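   * <p>
   * For illustration only (a hedged sketch: {@code rbf} is assumed to be a
   * populated filter and {@code fp} a key for which {@code membershipTest(fp)}
   * currently returns <code>true</code>):
   * <pre>{@code
   * // clear the bit expected to generate the fewest false negatives
   * rbf.selectiveClearing(fp, RemoveScheme.MINIMUM_FN);
   * }</pre>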
   * @param k The false positive key to remove from <i>this</i> retouched Bloom filter.
   * @param scheme The selective clearing scheme to apply.
   */
  public void selectiveClearing(Key k, short scheme) {
    if (k == null) {
      throw new NullPointerException("Key can not be null");
    }

    if (!membershipTest(k)) {
      throw new IllegalArgumentException("Key is not a member");
    }

    int index = 0;
    int[] h = hash.hash(k);

    switch(scheme) {

    case RANDOM:
      // clear one of the key's own bit positions, chosen uniformly at random
      index = h[randomRemove()];
      break;

    case MINIMUM_FN:
      index = minimumFnRemove(h);
      break;

    case MAXIMUM_FP:
      index = maximumFpRemove(h);
      break;

    case RATIO:
      index = ratioRemove(h);
      break;

    default:
      throw new AssertionError("Undefined selective clearing scheme");

    }

    clearBit(index);
  }

  private int randomRemove() {
    if (rand == null) {
      rand = new Random();
    }

    return rand.nextInt(nbHash);
  }

  /**
   * Chooses the bit position that minimizes the number of false negatives generated.
   * @param h The different bit positions.
   * @return The position that minimizes the number of false negatives generated.
   */
  private int minimumFnRemove(int[] h) {
    int minIndex = Integer.MAX_VALUE;
    double minValue = Double.MAX_VALUE;

    for (int i = 0; i < nbHash; i++) {
      double keyWeight = getWeight(keyVector[h[i]]);

      if (keyWeight < minValue) {
        minIndex = h[i];
        minValue = keyWeight;
      }

    }

    return minIndex;
  }

  /**
   * Chooses the bit position that maximizes the number of false positives removed.
   * @param h The different bit positions.
   * @return The position that maximizes the number of false positives removed.
   */
  private int maximumFpRemove(int[] h) {
    int maxIndex = Integer.MIN_VALUE;
    // Double.MIN_VALUE is the smallest positive double, not the most negative
    // one; start from negative infinity so a zero weight can still be chosen.
    double maxValue = Double.NEGATIVE_INFINITY;

    for (int i = 0; i < nbHash; i++) {
      double fpWeight = getWeight(fpVector[h[i]]);

      if (fpWeight > maxValue) {
        maxValue = fpWeight;
        maxIndex = h[i];
      }
    }

    return maxIndex;
  }

  /**
   * Chooses the bit position that minimizes the number of false negatives generated
   * while maximizing the number of false positives removed.
   * @param h The different bit positions.
   * @return The position that minimizes the number of false negatives generated
   * while maximizing the number of false positives removed.
   */
  private int ratioRemove(int[] h) {
    computeRatio();
    int minIndex = Integer.MAX_VALUE;
    double minValue = Double.MAX_VALUE;

    for (int i = 0; i < nbHash; i++) {
      if (ratio[h[i]] < minValue) {
        minValue = ratio[h[i]];
        minIndex = h[i];
      }
    }

    return minIndex;
  }

  /**
   * Clears a specified bit in the bit vector and keeps the KeyList vectors up to date.
   * @param index The position of the bit to clear.
   */
  private void clearBit(int index) {
    if (index < 0 || index >= vectorSize) {
      throw new ArrayIndexOutOfBoundsException(index);
    }

    List<Key> kl = keyVector[index];
    List<Key> fpl = fpVector[index];

    // update key list: remove each key mapped to this bit from every position
    // it hashes to; removeKey() also shrinks kl, hence get(0) and the isEmpty() guard
    int listSize = kl.size();
    for (int i = 0; i < listSize && !kl.isEmpty(); i++) {
      removeKey(kl.get(0), keyVector);
    }

    kl.clear();
    keyVector[index].clear();

    // update false positive list (same removal pattern as above)
    listSize = fpl.size();
    for (int i = 0; i < listSize && !fpl.isEmpty(); i++) {
      removeKey(fpl.get(0), fpVector);
    }

    fpl.clear();
    fpVector[index].clear();

    // update ratio
    ratio[index] = 0.0;

    // update bit vector
    bits.clear(index);
  }

  /**
   * Removes a given key from <i>this</i> filter.
   * @param k The key to remove.
   * @param vector The KeyList vector associated with the key.
   */
  private void removeKey(Key k, List<Key>[] vector) {
    if (k == null) {
      throw new NullPointerException("Key can not be null");
    }
    if (vector == null) {
      throw new NullPointerException("List<Key>[] can not be null");
    }

    int[] h = hash.hash(k);
    hash.clear();

    for (int i = 0; i < nbHash; i++) {
      vector[h[i]].remove(k);
    }
  }

  /**
   * Computes, for each bit position, the key weight to false positive weight ratio (A/FP).
   */
  private void computeRatio() {
    for (int i = 0; i < vectorSize; i++) {
      double keyWeight = getWeight(keyVector[i]);
      double fpWeight = getWeight(fpVector[i]);

      if (keyWeight > 0 && fpWeight > 0) {
        ratio[i] = keyWeight / fpWeight;
      }
    }
  }

  private double getWeight(List<Key> keyList) {
    double weight = 0.0;
    for (Key k : keyList) {
      weight += k.getWeight();
    }
    return weight;
  }

  /**
   * Creates and initialises the various vectors.
   */
  @SuppressWarnings("unchecked")
  private void createVector() {
    fpVector = new List[vectorSize];
    keyVector = new List[vectorSize];
    ratio = new double[vectorSize];

    for (int i = 0; i < vectorSize; i++) {
      fpVector[i] = Collections.synchronizedList(new ArrayList<Key>());
      keyVector[i] = Collections.synchronizedList(new ArrayList<Key>());
      ratio[i] = 0.0;
    }
  }

  // Writable
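  /*
   * A serialization round-trip sketch (illustrative only; the file name and
   * stream setup below are hypothetical). write() emits the parent filter
   * state, then the false positive lists, the key lists and the ratio vector;
   * readFields() reads them back in the same order into a filter created with
   * the default constructor:
   *
   *   try (DataOutputStream out =
   *       new DataOutputStream(new FileOutputStream("rbf.bin"))) {
   *     rbf.write(out);
   *   }
   *
   *   RetouchedBloomFilter copy = new RetouchedBloomFilter();
   *   try (DataInputStream in =
   *       new DataInputStream(new FileInputStream("rbf.bin"))) {
   *     copy.readFields(in);
   *   }
   */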

  @Override
  public void write(DataOutput out) throws IOException {
    super.write(out);
    for (int i = 0; i < fpVector.length; i++) {
      List<Key> list = fpVector[i];
      out.writeInt(list.size());
      for (Key k : list) {
        k.write(out);
      }
    }
    for (int i = 0; i < keyVector.length; i++) {
      List<Key> list = keyVector[i];
      out.writeInt(list.size());
      for (Key k : list) {
        k.write(out);
      }
    }
    for (int i = 0; i < ratio.length; i++) {
      out.writeDouble(ratio[i]);
    }
  }

  @Override
  public void readFields(DataInput in) throws IOException {
    super.readFields(in);
    createVector();
    for (int i = 0; i < fpVector.length; i++) {
      List<Key> list = fpVector[i];
      int size = in.readInt();
      for (int j = 0; j < size; j++) {
        Key k = new Key();
        k.readFields(in);
        list.add(k);
      }
    }
    for (int i = 0; i < keyVector.length; i++) {
      List<Key> list = keyVector[i];
      int size = in.readInt();
      for (int j = 0; j < size; j++) {
        Key k = new Key();
        k.readFields(in);
        list.add(k);
      }
    }
    for (int i = 0; i < ratio.length; i++) {
      ratio[i] = in.readDouble();
    }
  }
}