From c7f6972fdb1691236c11e9b0940b40d833afdbc0 Mon Sep 17 00:00:00 2001 From: Justin Mazzocchi <2831158+jzzocc@users.noreply.github.com> Date: Sat, 5 Sep 2020 19:08:30 -0700 Subject: [PATCH] Size bloom filter by bytes instead of bits --- .../Sources/CodableBloomFilter/BitArray.swift | 55 +++++++++++-------- .../CodableBloomFilter/BloomFilter.swift | 42 +++----------- .../DeterministicHasher.swift | 2 +- .../CodableBloomFilterTests.swift | 6 +- 4 files changed, 44 insertions(+), 61 deletions(-) diff --git a/CodableBloomFilter/Sources/CodableBloomFilter/BitArray.swift b/CodableBloomFilter/Sources/CodableBloomFilter/BitArray.swift index f5e11f6..a589a3f 100644 --- a/CodableBloomFilter/Sources/CodableBloomFilter/BitArray.swift +++ b/CodableBloomFilter/Sources/CodableBloomFilter/BitArray.swift @@ -5,53 +5,62 @@ import Foundation struct BitArray { - let count: Int + private var bytes: [UInt8] - private var items: [UInt8] - - init(count: Int) { - self.count = count - - var (byteCount, bitRemainder) = count.quotientAndRemainder(dividingBy: Self.bitsInByte) - - byteCount += bitRemainder > 0 ? 1 : 0 - - items = [UInt8](repeating: 0, count: byteCount) + init(byteCount: Int) { + self.bytes = [UInt8](repeating: 0, count: byteCount) } - init(data: Data, count: Int) { - self.items = Array(data) - self.count = count + init(data: Data) { + bytes = Array(data) } } extension BitArray { - var data: Data { Data(items) } + var bitCount: Int { bytes.count * Self.bitsInByte } subscript(index: Int) -> Bool { get { - let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte) + let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index) - return items[byteCount] & mask(index: bitPosition) > 0 + return bytes[byteIndex] & Self.mask(bitIndex: bitIndex) > 0 } set { - let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte) + let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index) if newValue { - items[byteCount] |= mask(index: bitPosition) + bytes[byteIndex] |= Self.mask(bitIndex: bitIndex) } else { - items[byteCount] &= ~mask(index: bitPosition) + bytes[byteIndex] &= ~Self.mask(bitIndex: bitIndex) } } } } +extension BitArray: Codable { + init(from decoder: Decoder) throws { + let container = try decoder.singleValueContainer() + + bytes = Array(try container.decode(Data.self)) + } + + func encode(to encoder: Encoder) throws { + var container = encoder.singleValueContainer() + + try container.encode(Data(bytes)) + } +} + private extension BitArray { static let bitsInByte = 8 - func mask(index: Int) -> UInt8 { - switch index { + static func byteAndBitIndices(index: Int) -> (Int, Int) { + index.quotientAndRemainder(dividingBy: bitsInByte) + } + + static func mask(bitIndex: Int) -> UInt8 { + switch bitIndex { case 0: return 0b00000001 case 1: return 0b00000010 case 2: return 0b00000100 @@ -61,7 +70,7 @@ private extension BitArray { case 6: return 0b01000000 case 7: return 0b10000000 default: - fatalError("Invalid index: \(index)") + fatalError("Invalid bit index: \(bitIndex)") } } } diff --git a/CodableBloomFilter/Sources/CodableBloomFilter/BloomFilter.swift b/CodableBloomFilter/Sources/CodableBloomFilter/BloomFilter.swift index 3e1abaf..030f36a 100644 --- a/CodableBloomFilter/Sources/CodableBloomFilter/BloomFilter.swift +++ b/CodableBloomFilter/Sources/CodableBloomFilter/BloomFilter.swift @@ -4,59 +4,33 @@ import Foundation // https://en.wikipedia.org/wiki/Bloom_filter // https://khanlou.com/2018/09/bloom-filters/ -// This implementation uses deterministic hashing functions so it can be serialized / deserialized +// This implementation uses deterministic hashing functions so it can conform to Codable -public struct BloomFilter { +public struct BloomFilter: Codable { public let hashers: [DeterministicHasher] - public let bitCount: Int - private var bitArray: BitArray + private var data: BitArray - public init(hashers: [DeterministicHasher], bits: Int) { + public init(hashers: [DeterministicHasher], byteCount: Int) { self.hashers = hashers - bitCount = bits - bitArray = BitArray(count: bits) + data = BitArray(byteCount: byteCount) } } public extension BloomFilter { mutating func insert(_ newMember: T) { for index in indices(newMember) { - bitArray[index] = true + data[index] = true } } func contains(_ member: T) -> Bool { - indices(member).map { bitArray[$0] }.allSatisfy { $0 } - } -} - -extension BloomFilter: Codable { - private enum CodingKeys: String, CodingKey { - case hashers - case bits - case data - } - - public init(from decoder: Decoder) throws { - let container = try decoder.container(keyedBy: CodingKeys.self) - - hashers = try container.decode([DeterministicHasher].self, forKey: .hashers) - bitCount = try container.decode(Int.self, forKey: .bits) - bitArray = BitArray(data: try container.decode(Data.self, forKey: .data), count: bitCount) - } - - public func encode(to encoder: Encoder) throws { - var container = encoder.container(keyedBy: CodingKeys.self) - - try container.encode(hashers, forKey: .hashers) - try container.encode(bitCount, forKey: .bits) - try container.encode(bitArray.data, forKey: .data) + indices(member).allSatisfy { data[$0] } } } private extension BloomFilter { func indices(_ member: T) -> [Int] { - hashers.map { abs($0.apply(member)) % bitCount } + hashers.map { abs($0.apply(member)) % data.bitCount } } } diff --git a/CodableBloomFilter/Sources/CodableBloomFilter/DeterministicHasher.swift b/CodableBloomFilter/Sources/CodableBloomFilter/DeterministicHasher.swift index 0121b84..650bc73 100644 --- a/CodableBloomFilter/Sources/CodableBloomFilter/DeterministicHasher.swift +++ b/CodableBloomFilter/Sources/CodableBloomFilter/DeterministicHasher.swift @@ -15,7 +15,7 @@ extension DeterministicHasher { } } -// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73 +// http://www.cse.yorku.ca/~oz/hash.html private extension DeterministicHasher { var initial: Int { diff --git a/CodableBloomFilter/Tests/CodableBloomFilterTests/CodableBloomFilterTests.swift b/CodableBloomFilter/Tests/CodableBloomFilterTests/CodableBloomFilterTests.swift index 9d8105f..b6801e2 100644 --- a/CodableBloomFilter/Tests/CodableBloomFilterTests/CodableBloomFilterTests.swift +++ b/CodableBloomFilter/Tests/CodableBloomFilterTests/CodableBloomFilterTests.swift @@ -3,7 +3,7 @@ import XCTest final class CodableBloomFilterTests: XCTestCase { func testContains() { - var sut = BloomFilter(hashers: [.djb2, .sdbm], bits: 1024) + var sut = BloomFilter(hashers: [.djb2, .sdbm], byteCount: 128) sut.insert("lol") sut.insert("ok") @@ -15,8 +15,8 @@ final class CodableBloomFilterTests: XCTestCase { } func testCoding() throws { - var sut = BloomFilter(hashers: [.djb2, .sdbm], bits: 64) - let expectedSerialization = Data(#"{"bits":64,"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8) + var sut = BloomFilter(hashers: [.djb2, .sdbm], byteCount: 8) + let expectedSerialization = Data(#"{"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8) sut.insert("lol") sut.insert("ok")