Size bloom filter by bytes instead of bits

This commit is contained in:
Justin Mazzocchi 2020-09-05 19:08:30 -07:00
parent e731bdb1e3
commit c7f6972fdb
No known key found for this signature in database
GPG key ID: E223E6937AAFB01C
4 changed files with 44 additions and 61 deletions

View file

@ -5,53 +5,62 @@
import Foundation import Foundation
struct BitArray { struct BitArray {
let count: Int private var bytes: [UInt8]
private var items: [UInt8] init(byteCount: Int) {
self.bytes = [UInt8](repeating: 0, count: byteCount)
init(count: Int) {
self.count = count
var (byteCount, bitRemainder) = count.quotientAndRemainder(dividingBy: Self.bitsInByte)
byteCount += bitRemainder > 0 ? 1 : 0
items = [UInt8](repeating: 0, count: byteCount)
} }
init(data: Data, count: Int) { init(data: Data) {
self.items = Array(data) bytes = Array(data)
self.count = count
} }
} }
extension BitArray { extension BitArray {
var data: Data { Data(items) } var bitCount: Int { bytes.count * Self.bitsInByte }
subscript(index: Int) -> Bool { subscript(index: Int) -> Bool {
get { get {
let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte) let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index)
return items[byteCount] & mask(index: bitPosition) > 0 return bytes[byteIndex] & Self.mask(bitIndex: bitIndex) > 0
} }
set { set {
let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte) let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index)
if newValue { if newValue {
items[byteCount] |= mask(index: bitPosition) bytes[byteIndex] |= Self.mask(bitIndex: bitIndex)
} else { } else {
items[byteCount] &= ~mask(index: bitPosition) bytes[byteIndex] &= ~Self.mask(bitIndex: bitIndex)
} }
} }
} }
} }
extension BitArray: Codable {
init(from decoder: Decoder) throws {
let container = try decoder.singleValueContainer()
bytes = Array(try container.decode(Data.self))
}
func encode(to encoder: Encoder) throws {
var container = encoder.singleValueContainer()
try container.encode(Data(bytes))
}
}
private extension BitArray { private extension BitArray {
static let bitsInByte = 8 static let bitsInByte = 8
func mask(index: Int) -> UInt8 { static func byteAndBitIndices(index: Int) -> (Int, Int) {
switch index { index.quotientAndRemainder(dividingBy: bitsInByte)
}
static func mask(bitIndex: Int) -> UInt8 {
switch bitIndex {
case 0: return 0b00000001 case 0: return 0b00000001
case 1: return 0b00000010 case 1: return 0b00000010
case 2: return 0b00000100 case 2: return 0b00000100
@ -61,7 +70,7 @@ private extension BitArray {
case 6: return 0b01000000 case 6: return 0b01000000
case 7: return 0b10000000 case 7: return 0b10000000
default: default:
fatalError("Invalid index: \(index)") fatalError("Invalid bit index: \(bitIndex)")
} }
} }
} }

View file

@ -4,59 +4,33 @@ import Foundation
// https://en.wikipedia.org/wiki/Bloom_filter // https://en.wikipedia.org/wiki/Bloom_filter
// https://khanlou.com/2018/09/bloom-filters/ // https://khanlou.com/2018/09/bloom-filters/
// This implementation uses deterministic hashing functions so it can be serialized / deserialized // This implementation uses deterministic hashing functions so it can conform to Codable
public struct BloomFilter<T: DeterministicallyHashable> { public struct BloomFilter<T: DeterministicallyHashable>: Codable {
public let hashers: [DeterministicHasher] public let hashers: [DeterministicHasher]
public let bitCount: Int
private var bitArray: BitArray private var data: BitArray
public init(hashers: [DeterministicHasher], bits: Int) { public init(hashers: [DeterministicHasher], byteCount: Int) {
self.hashers = hashers self.hashers = hashers
bitCount = bits data = BitArray(byteCount: byteCount)
bitArray = BitArray(count: bits)
} }
} }
public extension BloomFilter { public extension BloomFilter {
mutating func insert(_ newMember: T) { mutating func insert(_ newMember: T) {
for index in indices(newMember) { for index in indices(newMember) {
bitArray[index] = true data[index] = true
} }
} }
func contains(_ member: T) -> Bool { func contains(_ member: T) -> Bool {
indices(member).map { bitArray[$0] }.allSatisfy { $0 } indices(member).allSatisfy { data[$0] }
}
}
extension BloomFilter: Codable {
private enum CodingKeys: String, CodingKey {
case hashers
case bits
case data
}
public init(from decoder: Decoder) throws {
let container = try decoder.container(keyedBy: CodingKeys.self)
hashers = try container.decode([DeterministicHasher].self, forKey: .hashers)
bitCount = try container.decode(Int.self, forKey: .bits)
bitArray = BitArray(data: try container.decode(Data.self, forKey: .data), count: bitCount)
}
public func encode(to encoder: Encoder) throws {
var container = encoder.container(keyedBy: CodingKeys.self)
try container.encode(hashers, forKey: .hashers)
try container.encode(bitCount, forKey: .bits)
try container.encode(bitArray.data, forKey: .data)
} }
} }
private extension BloomFilter { private extension BloomFilter {
func indices(_ member: T) -> [Int] { func indices(_ member: T) -> [Int] {
hashers.map { abs($0.apply(member)) % bitCount } hashers.map { abs($0.apply(member)) % data.bitCount }
} }
} }

View file

@ -15,7 +15,7 @@ extension DeterministicHasher {
} }
} }
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73 // http://www.cse.yorku.ca/~oz/hash.html
private extension DeterministicHasher { private extension DeterministicHasher {
var initial: Int { var initial: Int {

View file

@ -3,7 +3,7 @@ import XCTest
final class CodableBloomFilterTests: XCTestCase { final class CodableBloomFilterTests: XCTestCase {
func testContains() { func testContains() {
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], bits: 1024) var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], byteCount: 128)
sut.insert("lol") sut.insert("lol")
sut.insert("ok") sut.insert("ok")
@ -15,8 +15,8 @@ final class CodableBloomFilterTests: XCTestCase {
} }
func testCoding() throws { func testCoding() throws {
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], bits: 64) var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], byteCount: 8)
let expectedSerialization = Data(#"{"bits":64,"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8) let expectedSerialization = Data(#"{"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8)
sut.insert("lol") sut.insert("lol")
sut.insert("ok") sut.insert("ok")