001/**
002 *
003 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org)
004 * All rights reserved.
005 * Redistribution and use in source and binary forms, with or 
006 * without modification, are permitted provided that the following 
007 * conditions are met:
008 *  - Redistributions of source code must retain the above copyright 
009 *    notice, this list of conditions and the following disclaimer.
010 *  - Redistributions in binary form must reproduce the above copyright 
011 *    notice, this list of conditions and the following disclaimer in 
012 *    the documentation and/or other materials provided with the distribution.
013 *  - Neither the name of the University Catholique de Louvain - UCL
014 *    nor the names of its contributors may be used to endorse or 
015 *    promote products derived from this software without specific prior 
016 *    written permission.
017 *    
018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 
019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 
020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 
021 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 
022 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 
023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 
024 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 
025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
026 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 
027 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 
028 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 
029 * POSSIBILITY OF SUCH DAMAGE.
030 */
031
032/**
033 * Licensed to the Apache Software Foundation (ASF) under one
034 * or more contributor license agreements.  See the NOTICE file
035 * distributed with this work for additional information
036 * regarding copyright ownership.  The ASF licenses this file
037 * to you under the Apache License, Version 2.0 (the
038 * "License"); you may not use this file except in compliance
039 * with the License.  You may obtain a copy of the License at
040 *
041 *     http://www.apache.org/licenses/LICENSE-2.0
042 *
043 * Unless required by applicable law or agreed to in writing, software
044 * distributed under the License is distributed on an "AS IS" BASIS,
045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
046 * See the License for the specific language governing permissions and
047 * limitations under the License.
048 */
049
050package org.apache.hadoop.util.bloom;
051
052import java.io.DataInput;
053import java.io.DataOutput;
054import java.io.IOException;
055
056import java.util.BitSet;
057
058import org.apache.hadoop.classification.InterfaceAudience;
059import org.apache.hadoop.classification.InterfaceStability;
060
061/**
062 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970.
063 * <p>
064 * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by 
065 * the networking research community in the past decade thanks to the bandwidth efficiencies that it
066 * offers for the transmission of set membership information between networked hosts.  A sender encodes 
067 * the information into a bit vector, the Bloom filter, that is more compact than a conventional 
068 * representation. Computation and space costs for construction are linear in the number of elements.  
069 * The receiver uses the filter to test whether various elements are members of the set. Though the 
070 * filter will occasionally return a false positive, it will never return a false negative. When creating 
071 * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. 
072 * 
073 * <p>
074 * Originally created by
075 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>.
076 * 
077 * @see Filter The general behavior of a filter
078 * 
079 * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a>
080 */
081@InterfaceAudience.Public
082@InterfaceStability.Stable
083public class BloomFilter extends Filter {
084  private static final byte[] bitvalues = new byte[] {
085    (byte)0x01,
086    (byte)0x02,
087    (byte)0x04,
088    (byte)0x08,
089    (byte)0x10,
090    (byte)0x20,
091    (byte)0x40,
092    (byte)0x80
093  };
094  
095  /** The bit vector. */
096  BitSet bits;
097
098  /** Default constructor - use with readFields */
099  public BloomFilter() {
100    super();
101  }
102  
103  /**
104   * Constructor
105   * @param vectorSize The vector size of <i>this</i> filter.
106   * @param nbHash The number of hash function to consider.
107   * @param hashType type of the hashing function (see
108   * {@link org.apache.hadoop.util.hash.Hash}).
109   */
110  public BloomFilter(int vectorSize, int nbHash, int hashType) {
111    super(vectorSize, nbHash, hashType);
112
113    bits = new BitSet(this.vectorSize);
114  }
115
116  @Override
117  public void add(Key key) {
118    if(key == null) {
119      throw new NullPointerException("key cannot be null");
120    }
121
122    int[] h = hash.hash(key);
123    hash.clear();
124
125    for(int i = 0; i < nbHash; i++) {
126      bits.set(h[i]);
127    }
128  }
129
130  @Override
131  public void and(Filter filter) {
132    if(filter == null
133        || !(filter instanceof BloomFilter)
134        || filter.vectorSize != this.vectorSize
135        || filter.nbHash != this.nbHash) {
136      throw new IllegalArgumentException("filters cannot be and-ed");
137    }
138
139    this.bits.and(((BloomFilter) filter).bits);
140  }
141
142  @Override
143  public boolean membershipTest(Key key) {
144    if(key == null) {
145      throw new NullPointerException("key cannot be null");
146    }
147
148    int[] h = hash.hash(key);
149    hash.clear();
150    for(int i = 0; i < nbHash; i++) {
151      if(!bits.get(h[i])) {
152        return false;
153      }
154    }
155    return true;
156  }
157
158  @Override
159  public void not() {
160    bits.flip(0, vectorSize);
161  }
162
163  @Override
164  public void or(Filter filter) {
165    if(filter == null
166        || !(filter instanceof BloomFilter)
167        || filter.vectorSize != this.vectorSize
168        || filter.nbHash != this.nbHash) {
169      throw new IllegalArgumentException("filters cannot be or-ed");
170    }
171    bits.or(((BloomFilter) filter).bits);
172  }
173
174  @Override
175  public void xor(Filter filter) {
176    if(filter == null
177        || !(filter instanceof BloomFilter)
178        || filter.vectorSize != this.vectorSize
179        || filter.nbHash != this.nbHash) {
180      throw new IllegalArgumentException("filters cannot be xor-ed");
181    }
182    bits.xor(((BloomFilter) filter).bits);
183  }
184
185  @Override
186  public String toString() {
187    return bits.toString();
188  }
189
190  /**
191   * @return size of the the bloomfilter
192   */
193  public int getVectorSize() {
194    return this.vectorSize;
195  }
196
197  // Writable
198
199  @Override
200  public void write(DataOutput out) throws IOException {
201    super.write(out);
202    byte[] bytes = new byte[getNBytes()];
203    for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
204      if (bitIndex == 8) {
205        bitIndex = 0;
206        byteIndex++;
207      }
208      if (bitIndex == 0) {
209        bytes[byteIndex] = 0;
210      }
211      if (bits.get(i)) {
212        bytes[byteIndex] |= bitvalues[bitIndex];
213      }
214    }
215    out.write(bytes);
216  }
217
218  @Override
219  public void readFields(DataInput in) throws IOException {
220    super.readFields(in);
221    bits = new BitSet(this.vectorSize);
222    byte[] bytes = new byte[getNBytes()];
223    in.readFully(bytes);
224    for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) {
225      if (bitIndex == 8) {
226        bitIndex = 0;
227        byteIndex++;
228      }
229      if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) {
230        bits.set(i);
231      }
232    }
233  }
234  
235  /* @return number of bytes needed to hold bit vector */
236  private int getNBytes() {
237    return (vectorSize + 7) / 8;
238  }
239}//end class