001    /**
002     *
003     * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
004     * All rights reserved.
005     * Redistribution and use in source and binary forms, with or 
006     * without modification, are permitted provided that the following 
007     * conditions are met:
008     *  - Redistributions of source code must retain the above copyright 
009     *    notice, this list of conditions and the following disclaimer.
010     *  - Redistributions in binary form must reproduce the above copyright 
011     *    notice, this list of conditions and the following disclaimer in 
012     *    the documentation and/or other materials provided with the distribution.
013     *  - Neither the name of the University Catholique de Louvain - UCL
014     *    nor the names of its contributors may be used to endorse or 
015     *    promote products derived from this software without specific prior 
016     *    written permission.
017     *    
018     * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
019     * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
020     * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
021     * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
022     * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
023     * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
024     * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
025     * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
026     * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
027     * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
028     * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
029     * POSSIBILITY OF SUCH DAMAGE.
030     */
031    
032    /**
033     * Licensed to the Apache Software Foundation (ASF) under one
034     * or more contributor license agreements.  See the NOTICE file
035     * distributed with this work for additional information
036     * regarding copyright ownership.  The ASF licenses this file
037     * to you under the Apache License, Version 2.0 (the
038     * "License"); you may not use this file except in compliance
039     * with the License.  You may obtain a copy of the License at
040     *
041     *     http://www.apache.org/licenses/LICENSE-2.0
042     *
043     * Unless required by applicable law or agreed to in writing, software
044     * distributed under the License is distributed on an "AS IS" BASIS,
045     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
046     * See the License for the specific language governing permissions and
047     * limitations under the License.
048     */
049    
050    package org.apache.hadoop.util.bloom;
051    
052    import java.io.DataInput;
053    import java.io.DataOutput;
054    import java.io.IOException;
055    
056    import java.util.BitSet;
057    
058    import org.apache.hadoop.classification.InterfaceAudience;
059    import org.apache.hadoop.classification.InterfaceStability;
060    
061    /**
062     * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
063     * <p>
064     * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by 
065     * the networking research community in the past decade thanks to the bandwidth efficiencies that it
066     * offers for the transmission of set membership information between networked hosts.  A sender encodes 
067     * the information into a bit vector, the Bloom filter, that is more compact than a conventional 
068     * representation. Computation and space costs for construction are linear in the number of elements.  
069     * The receiver uses the filter to test whether various elements are members of the set. Though the 
070     * filter will occasionally return a false positive, it will never return a false negative. When creating 
071     * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. 
072     * 
073     * <p>
074     * Originally created by
075     * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
076     * 
077     * @see Filter The general behavior of a filter
078     * 
079     * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
080     */
081    @InterfaceAudience.Public
082    @InterfaceStability.Stable
083    public class BloomFilter extends Filter {
084      private static final byte[] bitvalues = new byte[] {
085        (byte)0x01,
086        (byte)0x02,
087        (byte)0x04,
088        (byte)0x08,
089        (byte)0x10,
090        (byte)0x20,
091        (byte)0x40,
092        (byte)0x80
093      };
094      
095      /** The bit vector. */
096      BitSet bits;
097    
098      /** Default constructor - use with readFields */
099      public BloomFilter() {
100        super();
101      }
102      
103      /**
104       * Constructor
105       * @param vectorSize The vector size of <i>this</i> filter.
106       * @param nbHash The number of hash function to consider.
107       * @param hashType type of the hashing function (see
108       * {@link org.apache.hadoop.util.hash.Hash}).
109       */
110      public BloomFilter(int vectorSize, int nbHash, int hashType) {
111        super(vectorSize, nbHash, hashType);
112    
113        bits = new BitSet(this.vectorSize);
114      }
115    
116      @Override
117      public void add(Key key) {
118        if(key == null) {
119          throw new NullPointerException("key cannot be null");
120        }
121    
122        int[] h = hash.hash(key);
123        hash.clear();
124    
125        for(int i = 0; i < nbHash; i++) {
126          bits.set(h[i]);
127        }
128      }
129    
130      @Override
131      public void and(Filter filter) {
132        if(filter == null
133            || !(filter instanceof BloomFilter)
134            || filter.vectorSize != this.vectorSize
135            || filter.nbHash != this.nbHash) {
136          throw new IllegalArgumentException("filters cannot be and-ed");
137        }
138    
139        this.bits.and(((BloomFilter) filter).bits);
140      }
141    
142      @Override
143      public boolean membershipTest(Key key) {
144        if(key == null) {
145          throw new NullPointerException("key cannot be null");
146        }
147    
148        int[] h = hash.hash(key);
149        hash.clear();
150        for(int i = 0; i < nbHash; i++) {
151          if(!bits.get(h[i])) {
152            return false;
153          }
154        }
155        return true;
156      }
157    
158      @Override
159      public void not() {
160        bits.flip(0, vectorSize - 1);
161      }
162    
163      @Override
164      public void or(Filter filter) {
165        if(filter == null
166            || !(filter instanceof BloomFilter)
167            || filter.vectorSize != this.vectorSize
168            || filter.nbHash != this.nbHash) {
169          throw new IllegalArgumentException("filters cannot be or-ed");
170        }
171        bits.or(((BloomFilter) filter).bits);
172      }
173    
174      @Override
175      public void xor(Filter filter) {
176        if(filter == null
177            || !(filter instanceof BloomFilter)
178            || filter.vectorSize != this.vectorSize
179            || filter.nbHash != this.nbHash) {
180          throw new IllegalArgumentException("filters cannot be xor-ed");
181        }
182        bits.xor(((BloomFilter) filter).bits);
183      }
184    
185      @Override
186      public String toString() {
187        return bits.toString();
188      }
189    
190      /**
191       * @return size of the the bloomfilter
192       */
193      public int getVectorSize() {
194        return this.vectorSize;
195      }
196    
197      // Writable
198    
199      @Override
200      public void write(DataOutput out) throws IOException {
201        super.write(out);
202        byte[] bytes = new byte[getNBytes()];
203        for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
204          if (bitIndex == 8) {
205            bitIndex = 0;
206            byteIndex++;
207          }
208          if (bitIndex == 0) {
209            bytes[byteIndex] = 0;
210          }
211          if (bits.get(i)) {
212            bytes[byteIndex] |= bitvalues[bitIndex];
213          }
214        }
215        out.write(bytes);
216      }
217    
218      @Override
219      public void readFields(DataInput in) throws IOException {
220        super.readFields(in);
221        bits = new BitSet(this.vectorSize);
222        byte[] bytes = new byte[getNBytes()];
223        in.readFully(bytes);
224        for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
225          if (bitIndex == 8) {
226            bitIndex = 0;
227            byteIndex++;
228          }
229          if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
230            bits.set(i);
231          }
232        }
233      }
234      
235      /* @return number of bytes needed to hold bit vector */
236      private int getNBytes() {
237        return (vectorSize + 7) / 8;
238      }
239    }//end class