001/** 002 * 003 * Copyright (c) 2005, European Commission project OneLab under contract 034819 (http://www.one-lab.org) 004 * All rights reserved. 005 * Redistribution and use in source and binary forms, with or 006 * without modification, are permitted provided that the following 007 * conditions are met: 008 * - Redistributions of source code must retain the above copyright 009 * notice, this list of conditions and the following disclaimer. 010 * - Redistributions in binary form must reproduce the above copyright 011 * notice, this list of conditions and the following disclaimer in 012 * the documentation and/or other materials provided with the distribution. 013 * - Neither the name of the University Catholique de Louvain - UCL 014 * nor the names of its contributors may be used to endorse or 015 * promote products derived from this software without specific prior 016 * written permission. 017 * 018 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS 019 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT 020 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS 021 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE 022 * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 023 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, 024 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; 025 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 026 * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 027 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN 028 * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 029 * POSSIBILITY OF SUCH DAMAGE. 030 */ 031 032/** 033 * Licensed to the Apache Software Foundation (ASF) under one 034 * or more contributor license agreements. See the NOTICE file 035 * distributed with this work for additional information 036 * regarding copyright ownership. The ASF licenses this file 037 * to you under the Apache License, Version 2.0 (the 038 * "License"); you may not use this file except in compliance 039 * with the License. You may obtain a copy of the License at 040 * 041 * http://www.apache.org/licenses/LICENSE-2.0 042 * 043 * Unless required by applicable law or agreed to in writing, software 044 * distributed under the License is distributed on an "AS IS" BASIS, 045 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 046 * See the License for the specific language governing permissions and 047 * limitations under the License. 048 */ 049 050package org.apache.hadoop.util.bloom; 051 052import java.io.DataInput; 053import java.io.DataOutput; 054import java.io.IOException; 055 056import java.util.BitSet; 057 058import org.apache.hadoop.classification.InterfaceAudience; 059import org.apache.hadoop.classification.InterfaceStability; 060 061/** 062 * Implements a <i>Bloom filter</i>, as defined by Bloom in 1970. 063 * <p> 064 * The Bloom filter is a data structure that was introduced in 1970 and that has been adopted by 065 * the networking research community in the past decade thanks to the bandwidth efficiencies that it 066 * offers for the transmission of set membership information between networked hosts. A sender encodes 067 * the information into a bit vector, the Bloom filter, that is more compact than a conventional 068 * representation. Computation and space costs for construction are linear in the number of elements. 069 * The receiver uses the filter to test whether various elements are members of the set. Though the 070 * filter will occasionally return a false positive, it will never return a false negative. When creating 071 * the filter, the sender can choose its desired point in a trade-off between the false positive rate and the size. 072 * 073 * <p> 074 * Originally created by 075 * <a href="http://www.one-lab.org">European Commission One-Lab Project 034819</a>. 076 * 077 * @see Filter The general behavior of a filter 078 * 079 * @see <a href="http://portal.acm.org/citation.cfm?id=362692&dl=ACM&coll=portal">Space/Time Trade-Offs in Hash Coding with Allowable Errors</a> 080 */ 081@InterfaceAudience.Public 082@InterfaceStability.Stable 083public class BloomFilter extends Filter { 084 private static final byte[] bitvalues = new byte[] { 085 (byte)0x01, 086 (byte)0x02, 087 (byte)0x04, 088 (byte)0x08, 089 (byte)0x10, 090 (byte)0x20, 091 (byte)0x40, 092 (byte)0x80 093 }; 094 095 /** The bit vector. */ 096 BitSet bits; 097 098 /** Default constructor - use with readFields */ 099 public BloomFilter() { 100 super(); 101 } 102 103 /** 104 * Constructor 105 * @param vectorSize The vector size of <i>this</i> filter. 106 * @param nbHash The number of hash function to consider. 107 * @param hashType type of the hashing function (see 108 * {@link org.apache.hadoop.util.hash.Hash}). 109 */ 110 public BloomFilter(int vectorSize, int nbHash, int hashType) { 111 super(vectorSize, nbHash, hashType); 112 113 bits = new BitSet(this.vectorSize); 114 } 115 116 @Override 117 public void add(Key key) { 118 if(key == null) { 119 throw new NullPointerException("key cannot be null"); 120 } 121 122 int[] h = hash.hash(key); 123 hash.clear(); 124 125 for(int i = 0; i < nbHash; i++) { 126 bits.set(h[i]); 127 } 128 } 129 130 @Override 131 public void and(Filter filter) { 132 if(filter == null 133 || !(filter instanceof BloomFilter) 134 || filter.vectorSize != this.vectorSize 135 || filter.nbHash != this.nbHash) { 136 throw new IllegalArgumentException("filters cannot be and-ed"); 137 } 138 139 this.bits.and(((BloomFilter) filter).bits); 140 } 141 142 @Override 143 public boolean membershipTest(Key key) { 144 if(key == null) { 145 throw new NullPointerException("key cannot be null"); 146 } 147 148 int[] h = hash.hash(key); 149 hash.clear(); 150 for(int i = 0; i < nbHash; i++) { 151 if(!bits.get(h[i])) { 152 return false; 153 } 154 } 155 return true; 156 } 157 158 @Override 159 public void not() { 160 bits.flip(0, vectorSize); 161 } 162 163 @Override 164 public void or(Filter filter) { 165 if(filter == null 166 || !(filter instanceof BloomFilter) 167 || filter.vectorSize != this.vectorSize 168 || filter.nbHash != this.nbHash) { 169 throw new IllegalArgumentException("filters cannot be or-ed"); 170 } 171 bits.or(((BloomFilter) filter).bits); 172 } 173 174 @Override 175 public void xor(Filter filter) { 176 if(filter == null 177 || !(filter instanceof BloomFilter) 178 || filter.vectorSize != this.vectorSize 179 || filter.nbHash != this.nbHash) { 180 throw new IllegalArgumentException("filters cannot be xor-ed"); 181 } 182 bits.xor(((BloomFilter) filter).bits); 183 } 184 185 @Override 186 public String toString() { 187 return bits.toString(); 188 } 189 190 /** 191 * @return size of the the bloomfilter 192 */ 193 public int getVectorSize() { 194 return this.vectorSize; 195 } 196 197 // Writable 198 199 @Override 200 public void write(DataOutput out) throws IOException { 201 super.write(out); 202 byte[] bytes = new byte[getNBytes()]; 203 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { 204 if (bitIndex == 8) { 205 bitIndex = 0; 206 byteIndex++; 207 } 208 if (bitIndex == 0) { 209 bytes[byteIndex] = 0; 210 } 211 if (bits.get(i)) { 212 bytes[byteIndex] |= bitvalues[bitIndex]; 213 } 214 } 215 out.write(bytes); 216 } 217 218 @Override 219 public void readFields(DataInput in) throws IOException { 220 super.readFields(in); 221 bits = new BitSet(this.vectorSize); 222 byte[] bytes = new byte[getNBytes()]; 223 in.readFully(bytes); 224 for(int i = 0, byteIndex = 0, bitIndex = 0; i < vectorSize; i++, bitIndex++) { 225 if (bitIndex == 8) { 226 bitIndex = 0; 227 byteIndex++; 228 } 229 if ((bytes[byteIndex] & bitvalues[bitIndex]) != 0) { 230 bits.set(i); 231 } 232 } 233 } 234 235 /* @return number of bytes needed to hold bit vector */ 236 private int getNBytes() { 237 return (vectorSize + 7) / 8; 238 } 239}//end class