一.什么是布隆过滤器
利用内存中的一个长度是m的位数组B,对其中所有位都置0,位数组的初始状态是每个位的值都是0。然后根据k个不同的散列函数,对每个遍历过的URL执行散列,每次散列的结果都是不大于m的一个整数a。根据散列得到的数在位数组B对应的位上置1,也就是让B[a]=1。
每次插入一个爬过的URL,也执行k次散列,只有当全部位都已经置1了才认为这个URL已经遍历过。
二.代码实现(Java)
BloomFilter.java
package com.rumo.foundation;import java.io.FileInputStream;import java.io.FileOutputStream;import java.io.ObjectInputStream;import java.io.ObjectOutputStream;import java.io.Serializable;import java.util.BitSet;import java.util.concurrent.atomic.AtomicInteger;public class BloomFilter implements Serializable{ private static final long serialVersionUID = 923464509229483197L; private final int[] seeds; private final int size; private final BitSet bitset; private final MisJudgeRate rate; private final AtomicInteger useCount = new AtomicInteger(0); private final Double autoClearRate; /** * 默认中等程序的误判率:MisjudgmentRate.MIDDLE 以及不自动清空数据 * @param dataCount */ public BloomFilter(int dataCount){ this(MisJudgeRate.MIDDLE, dataCount, null); } /** * * @param rate 枚举类型的误判率 * @param dataCount 预期处理的数据规模 * @param autoClearRate * 自动清空过滤器内部信息的使用比率,传null则表示不会自动清理, * 当过滤器使用率达到100%时,则无论传入什么数据,都会认为在数据已经存在, * 当希望过滤器使用率达到80%时自动清空重新使用,则传入0.8 */ public BloomFilter(MisJudgeRate rate,int dataCount,Double autoClearRate){ long bitSize = rate.seeds.length*dataCount; if(bitSize<0 || bitSize>Integer.MAX_VALUE) throw new RuntimeException("DataCount too large,Overflow!"); this.rate = rate; this.seeds = rate.getSeeds(); this.size = (int)bitSize; this.bitset = new BitSet(size); this.autoClearRate = autoClearRate; } /** * 存在返回true,不存在记录返回false * @param data * @return */ public boolean addIfNotExist(String data){ checkNeedClear();// 检查是否需要清空重新使用 int[] indexs = new int[this.seeds.length]; boolean exist = true;// assign it exist int index; for (int i = 0; i < this.seeds.length; i++) { index = hash(data, this.seeds[i]); indexs[i] = index; if(exist){ if(!bitset.get(index)){ // 有一个不存在则认为第一次出现 exist = false; for (int j = 0; j <= i; j++) { setTrue(indexs[j]); } } }else{ setTrue(index); } } return exist; } public void setTrue(int index){ useCount.incrementAndGet(); bitset.set(index,true); } private void checkNeedClear(){ if(this.autoClearRate!=null){ if(getUseRate() >= this.autoClearRate){ synchronized (this) { if(getUseRate() >= this.autoClearRate){ bitset.clear(); useCount.set(0); } } } } } private int hash(String data,int seeds){ char[] value = data.toCharArray(); int hash = 0; if(value.length>0){ for (int i = 0; i < value.length; i++) { hash = i*hash + value[i]; } } hash = hash*seeds%this.size; return Math.abs(hash); } public void add(String data){ checkNeedClear(); for (int i = 0; i < this.seeds.length; i++) { int index = hash(data, this.seeds[i]); setTrue(index); } } public boolean check(String data){ for (int i = 0; i < this.seeds.length; i++) { int index = hash(data, this.seeds[i]); if(!this.bitset.get(index)) return false; } return true; } public double getUseRate(){ return (double)useCount.intValue() / (double)size; } /** * 清空过滤器中记录信息 */ public void clear(){ this.useCount.set(0); this.bitset.clear(); } public void saveFilter(String path){ try { ObjectOutputStream oos = new ObjectOutputStream(new FileOutputStream(path)); oos.writeObject(this); } catch (Exception e) { throw new RuntimeException("Save FilterObject to file Failure!"); } } public static BloomFilter readFilter(String path){ try { ObjectInputStream ois = new ObjectInputStream(new FileInputStream(path)); return (BloomFilter) ois.readObject(); } catch (Exception e) { throw new RuntimeException("Read Filter from ObjectFile Failure!"); } } public MisJudgeRate getRate() { return rate; } public enum MisJudgeRate{ /** * 分配4个位,误判率约为:0.14689159766308 */ VERY_SMALL(new int[]{2,3,5,7}), /** * 分配8个位,误判率约为:0.02157714146322 */ SMALL(new int[]{2,3,5,7,11,13,17,19}), /** * 分配16个位,误判率约为:0.00046557303372 */ MIDDLE(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53}), /** * 分配32个位,误判率约为:0.00000021167340 */ HIGH(new int[]{2,3,5,7,11,13,17,19,23,29,31,37,41,43,47,53, 59,61,67,71,73,79,83,89,97,101,193,107,109,113,127,131}); private int[] seeds; private MisJudgeRate(int[] seeds){ this.seeds = seeds; } public int[] getSeeds(){ return seeds; } public void setSeeds(int[] seeds){ this.seeds = seeds; } } public static void main(String[] args) { BloomFilter urlBloomFilter = new BloomFilter(5); System.out.println(urlBloomFilter.addIfNotExist("www.rumoss.cn")); System.out.println(urlBloomFilter.addIfNotExist("www.sohu.com")); System.out.println(urlBloomFilter.addIfNotExist("www.sina.com")); System.out.println(urlBloomFilter.getUseRate()); System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn")); System.out.println(urlBloomFilter.getUseRate()); String savePath = "D:\\bloomFilter.obj"; urlBloomFilter.saveFilter(savePath); urlBloomFilter = readFilter(savePath); System.out.println(urlBloomFilter.getUseRate()); System.out.println(urlBloomFilter.addIfNotExist("www.qq.cn")); } }
测试结果:
falsefalsetrue0.4false0.60.6true