1 | /** |
2 | * This program is free software: you can redistribute it and/or modify |
3 | * it under the terms of the GNU Lesser General Public License as published by |
4 | * the Free Software Foundation, either version 3 of the License, or |
5 | * (at your option) any later version. |
6 | * |
7 | * This program is distributed in the hope that it will be useful, |
8 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
10 | * GNU Lesser General Public License for more details. |
11 | * |
12 | * You should have received a copy of the GNU Lesser General Public License |
13 | * along with this program. If not, see <http://www.gnu.org/licenses/>. |
14 | */ |
15 | |
16 | package felix.util; |
17 | |
18 | import java.io.Serializable; |
19 | import java.nio.charset.Charset; |
20 | import java.security.MessageDigest; |
21 | import java.security.NoSuchAlgorithmException; |
22 | import java.util.BitSet; |
23 | import java.util.Collection; |
24 | |
25 | /** |
26 | * Implementation of a Bloom-filter, as described here: |
27 | * http://en.wikipedia.org/wiki/Bloom_filter |
28 | * |
29 | * Inspired by the SimpleBloomFilter-class written by Ian Clarke. This |
30 | * implementation provides a more evenly distributed Hash-function by |
31 | * using a proper digest instead of the Java RNG. Many of the changes |
32 | * were proposed in comments in his blog: |
33 | * http://blog.locut.us/2008/01/12/a-decent-stand-alone-java-bloom-filter-implementation/ |
34 | * |
35 | * @param <E> Object type that is to be inserted into the Bloom filter, e.g. String or Integer. |
36 | * @author Magnus Skjegstad <magnus@skjegstad.com> |
37 | */ |
38 | public class BloomFilter<E> implements Serializable { |
39 | private BitSet bitset; |
40 | private int bitSetSize; |
41 | private double bitsPerElement; |
42 | private int expectedNumberOfFilterElements; // expected (maximum) number of elements to be added |
43 | private int numberOfAddedElements; // number of elements actually added to the Bloom filter |
44 | private int k; // number of hash functions |
45 | |
46 | static final Charset charset = Charset.forName("UTF-8"); // encoding used for storing hash values as strings |
47 | |
48 | static final String hashName = "MD5"; // MD5 gives good enough accuracy in most circumstances. Change to SHA1 if it's needed |
49 | static final MessageDigest digestFunction; |
50 | static { // The digest method is reused between instances |
51 | MessageDigest tmp; |
52 | try { |
53 | tmp = java.security.MessageDigest.getInstance(hashName); |
54 | } catch (NoSuchAlgorithmException e) { |
55 | tmp = null; |
56 | } |
57 | digestFunction = tmp; |
58 | } |
59 | |
60 | /** |
61 | * Constructs an empty Bloom filter. The total length of the Bloom filter will be |
62 | * c*n. |
63 | * |
64 | * @param c is the number of bits used per element. |
65 | * @param n is the expected number of elements the filter will contain. |
66 | * @param k is the number of hash functions used. |
67 | */ |
68 | public BloomFilter(double c, int n, int k) { |
69 | this.expectedNumberOfFilterElements = n; |
70 | this.k = k; |
71 | this.bitsPerElement = c; |
72 | this.bitSetSize = (int)Math.ceil(c * n); |
73 | numberOfAddedElements = 0; |
74 | this.bitset = new BitSet(bitSetSize); |
75 | } |
76 | |
77 | /** |
78 | * Constructs an empty Bloom filter. The optimal number of hash functions (k) is estimated from the total size of the Bloom |
79 | * and the number of expected elements. |
80 | * |
81 | * @param bitSetSize defines how many bits should be used in total for the filter. |
82 | * @param expectedNumberOElements defines the maximum number of elements the filter is expected to contain. |
83 | */ |
84 | public BloomFilter(int bitSetSize, int expectedNumberOElements) { |
85 | this(bitSetSize / (double)expectedNumberOElements, |
86 | expectedNumberOElements, |
87 | (int) Math.round((bitSetSize / (double)expectedNumberOElements) * Math.log(2.0))); |
88 | } |
89 | |
90 | /** |
91 | * Constructs an empty Bloom filter with a given false positive probability. The number of bits per |
92 | * element and the number of hash functions is estimated |
93 | * to match the false positive probability. |
94 | * |
95 | * @param falsePositiveProbability is the desired false positive probability. |
96 | * @param expectedNumberOfElements is the expected number of elements in the Bloom filter. |
97 | */ |
98 | public BloomFilter(double falsePositiveProbability, int expectedNumberOfElements) { |
99 | this(Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2))) / Math.log(2), // c = k / ln(2) |
100 | expectedNumberOfElements, |
101 | (int)Math.ceil(-(Math.log(falsePositiveProbability) / Math.log(2)))); // k = ceil(-log_2(false prob.)) |
102 | } |
103 | |
104 | /** |
105 | * Construct a new Bloom filter based on existing Bloom filter data. |
106 | * |
107 | * @param bitSetSize defines how many bits should be used for the filter. |
108 | * @param expectedNumberOfFilterElements defines the maximum number of elements the filter is expected to contain. |
109 | * @param actualNumberOfFilterElements specifies how many elements have been inserted into the <code>filterData</code> BitSet. |
110 | * @param filterData a BitSet representing an existing Bloom filter. |
111 | */ |
112 | public BloomFilter(int bitSetSize, int expectedNumberOfFilterElements, int actualNumberOfFilterElements, BitSet filterData) { |
113 | this(bitSetSize, expectedNumberOfFilterElements); |
114 | this.bitset = filterData; |
115 | this.numberOfAddedElements = actualNumberOfFilterElements; |
116 | } |
117 | |
118 | /** |
119 | * Generates a digest based on the contents of a String. |
120 | * |
121 | * @param val specifies the input data. |
122 | * @param charset specifies the encoding of the input data. |
123 | * @return digest as long. |
124 | */ |
125 | public static long createHash(String val, Charset charset) { |
126 | return createHash(val.getBytes(charset)); |
127 | } |
128 | |
129 | /** |
130 | * Generates a digest based on the contents of a String. |
131 | * |
132 | * @param val specifies the input data. The encoding is expected to be UTF-8. |
133 | * @return digest as long. |
134 | */ |
135 | public static long createHash(String val) { |
136 | return createHash(val, charset); |
137 | } |
138 | |
139 | /** |
140 | * Generates a digest based on the contents of an array of bytes. |
141 | * |
142 | * @param data specifies input data. |
143 | * @return digest as long. |
144 | */ |
145 | public static long createHash(byte[] data) { |
146 | long h = 0; |
147 | byte[] res; |
148 | |
149 | synchronized (digestFunction) { |
150 | res = digestFunction.digest(data); |
151 | } |
152 | |
153 | for (int i = 0; i < 4; i++) { |
154 | h <<= 8; |
155 | h |= ((int) res[i]) & 0xFF; |
156 | } |
157 | return h; |
158 | } |
159 | |
160 | /** |
161 | * Compares the contents of two instances to see if they are equal. |
162 | * |
163 | * @param obj is the object to compare to. |
164 | * @return True if the contents of the objects are equal. |
165 | */ |
166 | @Override |
167 | public boolean equals(Object obj) { |
168 | if (obj == null) { |
169 | return false; |
170 | } |
171 | if (getClass() != obj.getClass()) { |
172 | return false; |
173 | } |
174 | final BloomFilter<E> other = (BloomFilter<E>) obj; |
175 | if (this.expectedNumberOfFilterElements != other.expectedNumberOfFilterElements) { |
176 | return false; |
177 | } |
178 | if (this.k != other.k) { |
179 | return false; |
180 | } |
181 | if (this.bitSetSize != other.bitSetSize) { |
182 | return false; |
183 | } |
184 | if (this.bitset != other.bitset && (this.bitset == null || !this.bitset.equals(other.bitset))) { |
185 | return false; |
186 | } |
187 | return true; |
188 | } |
189 | |
190 | /** |
191 | * Calculates a hash code for this class. |
192 | * @return hash code representing the contents of an instance of this class. |
193 | */ |
194 | @Override |
195 | public int hashCode() { |
196 | int hash = 7; |
197 | hash = 61 * hash + (this.bitset != null ? this.bitset.hashCode() : 0); |
198 | hash = 61 * hash + this.expectedNumberOfFilterElements; |
199 | hash = 61 * hash + this.bitSetSize; |
200 | hash = 61 * hash + this.k; |
201 | return hash; |
202 | } |
203 | |
204 | |
205 | /** |
206 | * Calculates the expected probability of false positives based on |
207 | * the number of expected filter elements and the size of the Bloom filter. |
208 | * <br /><br /> |
209 | * The value returned by this method is the <i>expected</i> rate of false |
210 | * positives, assuming the number of inserted elements equals the number of |
211 | * expected elements. If the number of elements in the Bloom filter is less |
212 | * than the expected value, the true probability of false positives will be lower. |
213 | * |
214 | * @return expected probability of false positives. |
215 | */ |
216 | public double expectedFalsePositiveProbability() { |
217 | return getFalsePositiveProbability(expectedNumberOfFilterElements); |
218 | } |
219 | |
220 | /** |
221 | * Calculate the probability of a false positive given the specified |
222 | * number of inserted elements. |
223 | * |
224 | * @param numberOfElements number of inserted elements. |
225 | * @return probability of a false positive. |
226 | */ |
227 | public double getFalsePositiveProbability(double numberOfElements) { |
228 | // (1 - e^(-k * n / m)) ^ k |
229 | return Math.pow((1 - Math.exp(-k * (double) numberOfElements |
230 | / (double) bitSetSize)), k); |
231 | |
232 | } |
233 | |
234 | /** |
235 | * Get the current probability of a false positive. The probability is calculated from |
236 | * the size of the Bloom filter and the current number of elements added to it. |
237 | * |
238 | * @return probability of false positives. |
239 | */ |
240 | public double getFalsePositiveProbability() { |
241 | return getFalsePositiveProbability(numberOfAddedElements); |
242 | } |
243 | |
244 | |
245 | /** |
246 | * Returns the value chosen for K.<br /> |
247 | * <br /> |
248 | * K is the optimal number of hash functions based on the size |
249 | * of the Bloom filter and the expected number of inserted elements. |
250 | * |
251 | * @return optimal k. |
252 | */ |
253 | public int getK() { |
254 | return k; |
255 | } |
256 | |
257 | /** |
258 | * Sets all bits to false in the Bloom filter. |
259 | */ |
260 | public void clear() { |
261 | bitset.clear(); |
262 | numberOfAddedElements = 0; |
263 | } |
264 | |
265 | /** |
266 | * Adds an object to the Bloom filter. The output from the object's |
267 | * toString() method is used as input to the hash functions. |
268 | * |
269 | * @param element is an element to register in the Bloom filter. |
270 | */ |
271 | public void add(E element) { |
272 | long hash; |
273 | String valString = element.toString(); |
274 | for (int x = 0; x < k; x++) { |
275 | hash = createHash(valString + Integer.toString(x)); |
276 | hash = hash % (long)bitSetSize; |
277 | bitset.set(Math.abs((int)hash), true); |
278 | } |
279 | numberOfAddedElements ++; |
280 | } |
281 | |
282 | /** |
283 | * Adds all elements from a Collection to the Bloom filter. |
284 | * @param c Collection of elements. |
285 | */ |
286 | public void addAll(Collection<? extends E> c) { |
287 | for (E element : c) |
288 | add(element); |
289 | } |
290 | |
291 | /** |
292 | * Returns true if the element could have been inserted into the Bloom filter. |
293 | * Use getFalsePositiveProbability() to calculate the probability of this |
294 | * being correct. |
295 | * |
296 | * @param element element to check. |
297 | * @return true if the element could have been inserted into the Bloom filter. |
298 | */ |
299 | public boolean contains(E element) { |
300 | long hash; |
301 | String valString = element.toString(); |
302 | for (int x = 0; x < k; x++) { |
303 | hash = createHash(valString + Integer.toString(x)); |
304 | hash = hash % (long)bitSetSize; |
305 | if (!bitset.get(Math.abs((int)hash))) |
306 | return false; |
307 | } |
308 | return true; |
309 | } |
310 | |
311 | /** |
312 | * Returns true if all the elements of a Collection could have been inserted |
313 | * into the Bloom filter. Use getFalsePositiveProbability() to calculate the |
314 | * probability of this being correct. |
315 | * @param c elements to check. |
316 | * @return true if all the elements in c could have been inserted into the Bloom filter. |
317 | */ |
318 | public boolean containsAll(Collection<? extends E> c) { |
319 | for (E element : c) |
320 | if (!contains(element)) |
321 | return false; |
322 | return true; |
323 | } |
324 | |
325 | /** |
326 | * Read a single bit from the Bloom filter. |
327 | * @param bit the bit to read. |
328 | * @return true if the bit is set, false if it is not. |
329 | */ |
330 | public boolean getBit(int bit) { |
331 | return bitset.get(bit); |
332 | } |
333 | |
334 | /** |
335 | * Set a single bit in the Bloom filter. |
336 | * @param bit is the bit to set. |
337 | * @param value If true, the bit is set. If false, the bit is cleared. |
338 | */ |
339 | public void setBit(int bit, boolean value) { |
340 | bitset.set(bit, value); |
341 | } |
342 | |
343 | /** |
344 | * Return the bit set used to store the Bloom filter. |
345 | * @return bit set representing the Bloom filter. |
346 | */ |
347 | public BitSet getBitSet() { |
348 | return bitset; |
349 | } |
350 | |
351 | /** |
352 | * Returns the number of bits in the Bloom filter. Use count() to retrieve |
353 | * the number of inserted elements. |
354 | * |
355 | * @return the size of the bitset used by the Bloom filter. |
356 | */ |
357 | public int size() { |
358 | return this.bitSetSize; |
359 | } |
360 | |
361 | /** |
362 | * Returns the number of elements added to the Bloom filter after it |
363 | * was constructed or after clear() was called. |
364 | * |
365 | * @return number of elements added to the Bloom filter. |
366 | */ |
367 | public int count() { |
368 | return this.numberOfAddedElements; |
369 | } |
370 | |
371 | /** |
372 | * Returns the expected number of elements to be inserted into the filter. |
373 | * This value is the same value as the one passed to the constructor. |
374 | * |
375 | * @return expected number of elements. |
376 | */ |
377 | public int getExpectedNumberOfElements() { |
378 | return expectedNumberOfFilterElements; |
379 | } |
380 | |
381 | /** |
382 | * Get expected number of bits per element when the Bloom filter is full. This value is set by the constructor |
383 | * when the Bloom filter is created. See also getBitsPerElement(). |
384 | * |
385 | * @return expected number of bits per element. |
386 | */ |
387 | public double getExpectedBitsPerElement() { |
388 | return this.bitsPerElement; |
389 | } |
390 | |
391 | /** |
392 | * Get actual number of bits per element based on the number of elements that have currently been inserted and the length |
393 | * of the Bloom filter. See also getExpectedBitsPerElement(). |
394 | * |
395 | * @return number of bits per element. |
396 | */ |
397 | public double getBitsPerElement() { |
398 | return this.bitSetSize / (double)numberOfAddedElements; |
399 | } |
400 | } |