mirror of
https://github.com/metabolist/metatext.git
synced 2024-11-25 17:50:59 +00:00
Size bloom filter by bytes instead of bits
This commit is contained in:
parent
e731bdb1e3
commit
c7f6972fdb
4 changed files with 44 additions and 61 deletions
|
@@ -5,53 +5,62 @@
|
||||||
import Foundation
|
import Foundation
|
||||||
|
|
||||||
struct BitArray {
|
struct BitArray {
|
||||||
let count: Int
|
private var bytes: [UInt8]
|
||||||
|
|
||||||
private var items: [UInt8]
|
init(byteCount: Int) {
|
||||||
|
self.bytes = [UInt8](repeating: 0, count: byteCount)
|
||||||
init(count: Int) {
|
|
||||||
self.count = count
|
|
||||||
|
|
||||||
var (byteCount, bitRemainder) = count.quotientAndRemainder(dividingBy: Self.bitsInByte)
|
|
||||||
|
|
||||||
byteCount += bitRemainder > 0 ? 1 : 0
|
|
||||||
|
|
||||||
items = [UInt8](repeating: 0, count: byteCount)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
init(data: Data, count: Int) {
|
init(data: Data) {
|
||||||
self.items = Array(data)
|
bytes = Array(data)
|
||||||
self.count = count
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
extension BitArray {
|
extension BitArray {
|
||||||
var data: Data { Data(items) }
|
var bitCount: Int { bytes.count * Self.bitsInByte }
|
||||||
|
|
||||||
subscript(index: Int) -> Bool {
|
subscript(index: Int) -> Bool {
|
||||||
get {
|
get {
|
||||||
let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte)
|
let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index)
|
||||||
|
|
||||||
return items[byteCount] & mask(index: bitPosition) > 0
|
return bytes[byteIndex] & Self.mask(bitIndex: bitIndex) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
set {
|
set {
|
||||||
let (byteCount, bitPosition) = index.quotientAndRemainder(dividingBy: Self.bitsInByte)
|
let (byteIndex, bitIndex) = Self.byteAndBitIndices(index: index)
|
||||||
|
|
||||||
if newValue {
|
if newValue {
|
||||||
items[byteCount] |= mask(index: bitPosition)
|
bytes[byteIndex] |= Self.mask(bitIndex: bitIndex)
|
||||||
} else {
|
} else {
|
||||||
items[byteCount] &= ~mask(index: bitPosition)
|
bytes[byteIndex] &= ~Self.mask(bitIndex: bitIndex)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
extension BitArray: Codable {
|
||||||
|
init(from decoder: Decoder) throws {
|
||||||
|
let container = try decoder.singleValueContainer()
|
||||||
|
|
||||||
|
bytes = Array(try container.decode(Data.self))
|
||||||
|
}
|
||||||
|
|
||||||
|
func encode(to encoder: Encoder) throws {
|
||||||
|
var container = encoder.singleValueContainer()
|
||||||
|
|
||||||
|
try container.encode(Data(bytes))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
private extension BitArray {
|
private extension BitArray {
|
||||||
static let bitsInByte = 8
|
static let bitsInByte = 8
|
||||||
|
|
||||||
func mask(index: Int) -> UInt8 {
|
static func byteAndBitIndices(index: Int) -> (Int, Int) {
|
||||||
switch index {
|
index.quotientAndRemainder(dividingBy: bitsInByte)
|
||||||
|
}
|
||||||
|
|
||||||
|
static func mask(bitIndex: Int) -> UInt8 {
|
||||||
|
switch bitIndex {
|
||||||
case 0: return 0b00000001
|
case 0: return 0b00000001
|
||||||
case 1: return 0b00000010
|
case 1: return 0b00000010
|
||||||
case 2: return 0b00000100
|
case 2: return 0b00000100
|
||||||
|
@@ -61,7 +70,7 @@ private extension BitArray {
|
||||||
case 6: return 0b01000000
|
case 6: return 0b01000000
|
||||||
case 7: return 0b10000000
|
case 7: return 0b10000000
|
||||||
default:
|
default:
|
||||||
fatalError("Invalid index: \(index)")
|
fatalError("Invalid bit index: \(bitIndex)")
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -4,59 +4,33 @@ import Foundation
|
||||||
|
|
||||||
// https://en.wikipedia.org/wiki/Bloom_filter
|
// https://en.wikipedia.org/wiki/Bloom_filter
|
||||||
// https://khanlou.com/2018/09/bloom-filters/
|
// https://khanlou.com/2018/09/bloom-filters/
|
||||||
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
|
// This implementation uses deterministic hashing functions so it can conform to Codable
|
||||||
|
|
||||||
public struct BloomFilter<T: DeterministicallyHashable> {
|
public struct BloomFilter<T: DeterministicallyHashable>: Codable {
|
||||||
public let hashers: [DeterministicHasher]
|
public let hashers: [DeterministicHasher]
|
||||||
public let bitCount: Int
|
|
||||||
|
|
||||||
private var bitArray: BitArray
|
private var data: BitArray
|
||||||
|
|
||||||
public init(hashers: [DeterministicHasher], bits: Int) {
|
public init(hashers: [DeterministicHasher], byteCount: Int) {
|
||||||
self.hashers = hashers
|
self.hashers = hashers
|
||||||
bitCount = bits
|
data = BitArray(byteCount: byteCount)
|
||||||
bitArray = BitArray(count: bits)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public extension BloomFilter {
|
public extension BloomFilter {
|
||||||
mutating func insert(_ newMember: T) {
|
mutating func insert(_ newMember: T) {
|
||||||
for index in indices(newMember) {
|
for index in indices(newMember) {
|
||||||
bitArray[index] = true
|
data[index] = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
func contains(_ member: T) -> Bool {
|
func contains(_ member: T) -> Bool {
|
||||||
indices(member).map { bitArray[$0] }.allSatisfy { $0 }
|
indices(member).allSatisfy { data[$0] }
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
extension BloomFilter: Codable {
|
|
||||||
private enum CodingKeys: String, CodingKey {
|
|
||||||
case hashers
|
|
||||||
case bits
|
|
||||||
case data
|
|
||||||
}
|
|
||||||
|
|
||||||
public init(from decoder: Decoder) throws {
|
|
||||||
let container = try decoder.container(keyedBy: CodingKeys.self)
|
|
||||||
|
|
||||||
hashers = try container.decode([DeterministicHasher].self, forKey: .hashers)
|
|
||||||
bitCount = try container.decode(Int.self, forKey: .bits)
|
|
||||||
bitArray = BitArray(data: try container.decode(Data.self, forKey: .data), count: bitCount)
|
|
||||||
}
|
|
||||||
|
|
||||||
public func encode(to encoder: Encoder) throws {
|
|
||||||
var container = encoder.container(keyedBy: CodingKeys.self)
|
|
||||||
|
|
||||||
try container.encode(hashers, forKey: .hashers)
|
|
||||||
try container.encode(bitCount, forKey: .bits)
|
|
||||||
try container.encode(bitArray.data, forKey: .data)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private extension BloomFilter {
|
private extension BloomFilter {
|
||||||
func indices(_ member: T) -> [Int] {
|
func indices(_ member: T) -> [Int] {
|
||||||
hashers.map { abs($0.apply(member)) % bitCount }
|
hashers.map { abs($0.apply(member)) % data.bitCount }
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@@ -15,7 +15,7 @@ extension DeterministicHasher {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
|
// http://www.cse.yorku.ca/~oz/hash.html
|
||||||
|
|
||||||
private extension DeterministicHasher {
|
private extension DeterministicHasher {
|
||||||
var initial: Int {
|
var initial: Int {
|
||||||
|
|
|
@@ -3,7 +3,7 @@ import XCTest
|
||||||
|
|
||||||
final class CodableBloomFilterTests: XCTestCase {
|
final class CodableBloomFilterTests: XCTestCase {
|
||||||
func testContains() {
|
func testContains() {
|
||||||
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], bits: 1024)
|
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], byteCount: 128)
|
||||||
|
|
||||||
sut.insert("lol")
|
sut.insert("lol")
|
||||||
sut.insert("ok")
|
sut.insert("ok")
|
||||||
|
@@ -15,8 +15,8 @@ final class CodableBloomFilterTests: XCTestCase {
|
||||||
}
|
}
|
||||||
|
|
||||||
func testCoding() throws {
|
func testCoding() throws {
|
||||||
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], bits: 64)
|
var sut = BloomFilter<String>(hashers: [.djb2, .sdbm], byteCount: 8)
|
||||||
let expectedSerialization = Data(#"{"bits":64,"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8)
|
let expectedSerialization = Data(#"{"data":"ABAAAAACAJA=","hashers":["djb2","sdbm"]}"#.utf8)
|
||||||
|
|
||||||
sut.insert("lol")
|
sut.insert("lol")
|
||||||
sut.insert("ok")
|
sut.insert("ok")
|
||||||
|
|
Loading…
Reference in a new issue