Make bloom filter codable

This commit is contained in:
Justin Mazzocchi 2020-09-05 15:21:22 -07:00
parent fd71f48a59
commit 321cea3ccd
No known key found for this signature in database
GPG key ID: E223E6937AAFB01C
9 changed files with 154 additions and 114 deletions

View file

@ -0,0 +1,21 @@
// swift-tools-version:5.3
import PackageDescription
// Manifest for the CodableBloomFilter package: one library product backed by a
// single source target, plus a test target that depends on that source target.
let moduleName = "CodableBloomFilter"

let package = Package(
    name: moduleName,
    products: [
        .library(name: moduleName, targets: [moduleName])
    ],
    dependencies: [],
    targets: [
        .target(name: moduleName, dependencies: []),
        .testTarget(
            name: moduleName + "Tests",
            dependencies: [.byName(name: moduleName)])
    ]
)

View file

@ -0,0 +1,92 @@
// Copyright © 2020 Metabolist. All rights reserved.
import Foundation
// https://en.wikipedia.org/wiki/Bloom_filter
// https://khanlou.com/2018/09/bloom-filters/
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
/// A Bloom filter over strings whose configuration (hash functions and bit
/// count) is stored alongside its bits.
struct BloomFilter {
    /// Hash functions applied to every member; each one contributes one bit index.
    let hashes: [Hash]
    /// Number of addressable bits; hash values are reduced modulo this count.
    let bitCount: Int
    /// Backing bit storage, created with `bitCount` as its count.
    private var bits: Bits

    /// Creates a filter with fresh `Bits` storage sized for `bitCount`.
    ///
    /// - Parameters:
    ///   - hashes: The hash functions used to derive bit indices.
    ///   - bitCount: The number of bits in the filter.
    /// - Precondition: `bitCount` is greater than zero. Bit indices are computed
    ///   modulo `bitCount`, so a zero count would otherwise trap with a division
    ///   by zero on the first insertion or lookup.
    init(hashes: [Hash], bitCount: Int) {
        precondition(bitCount > 0, "BloomFilter requires a positive bitCount")
        self.hashes = hashes
        self.bitCount = bitCount
        bits = Bits(count: bitCount)
    }
}
extension BloomFilter {
    /// Deterministic string hash functions a filter can be configured with.
    /// The case names double as the raw string values used when coding.
    enum Hash: String, Codable {
        case djb2, sdbm
    }

    /// Sets every bit index derived from `newMember`.
    mutating func insert(_ newMember: String) {
        indices(newMember).forEach { bits[$0] = true }
    }

    /// Returns `true` when every bit index derived from `member` is set.
    ///
    /// Because different members can share bit indices, a `true` result may be
    /// a false positive; a `false` result means `member` was never inserted.
    func contains(_ member: String) -> Bool {
        indices(member).allSatisfy { bits[$0] }
    }
}
extension BloomFilter: Codable {
    /// Keys of the keyed container the filter is coded into.
    enum CodingKeys: String, CodingKey {
        case hashes
        case bitCount
        case data
    }

    /// Decodes a filter from its `hashes`, `bitCount` and `data` entries.
    ///
    /// - Throws: Any error thrown by the container while decoding, or
    ///   `DecodingError.dataCorrupted` when `bitCount` is not positive or when
    ///   `data` holds fewer than `bitCount` bits.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        let data = try container.decode(Data.self, forKey: .data)
        let decodedHashes = try container.decode([Hash].self, forKey: .hashes)
        let decodedBitCount = try container.decode(Int.self, forKey: .bitCount)

        // Bit indices are computed modulo bitCount, so zero would trap later.
        guard decodedBitCount > 0 else {
            throw DecodingError.dataCorruptedError(
                forKey: .bitCount,
                in: container,
                debugDescription: "bitCount must be greater than zero")
        }

        // A payload with fewer than bitCount bits cannot back every index.
        // Written as a ceiling division so the comparison cannot overflow.
        let requiredByteCount = (decodedBitCount - 1) / 8 + 1
        guard data.count >= requiredByteCount else {
            throw DecodingError.dataCorruptedError(
                forKey: .data,
                in: container,
                debugDescription: "data holds fewer bits than bitCount")
        }

        hashes = decodedHashes
        bitCount = decodedBitCount
        bits = Bits(bytes: Array(data), count: decodedBitCount)
    }

    /// Encodes the hash configuration, the bit count and the raw bit data.
    func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)
        try container.encode(hashes, forKey: .hashes)
        try container.encode(bitCount, forKey: .bitCount)
        try container.encode(bits.data, forKey: .data)
    }
}
private extension BloomFilter {
    /// Maps `string` to one bit index per configured hash function.
    ///
    /// The hash functions use wrapping arithmetic, so they can return any `Int`,
    /// including `Int.min`, for which `abs(_:)` traps. Working on `magnitude`
    /// (an unsigned value) gives the same index for every other hash value and
    /// a valid index for `Int.min` as well.
    func indices(_ string: String) -> [Int] {
        let modulus = bitCount.magnitude
        // The remainder is strictly below `modulus`, which never exceeds
        // Int.max + 1, so the conversion back to Int cannot overflow.
        return hashes.map { Int($0.apply(string).magnitude % modulus) }
    }
}
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
private extension BloomFilter.Hash {
    /// Hashes `string` by folding its Unicode scalar values, starting from
    /// `initial` and combining each scalar with `then(result:next:)`.
    func apply(_ string: String) -> Int {
        string.unicodeScalars.reduce(initial) { then(result: $0, next: $1.value) }
    }

    /// Seed value of the fold for this hash function.
    var initial: Int {
        switch self {
        case .djb2: return 5381
        case .sdbm: return 0
        }
    }

    /// Combines the running hash `result` with the next scalar value.
    ///
    /// Every additive step wraps on overflow. The final subtraction of the
    /// sdbm step uses `&-` as well: a trapping `-` there would crash once the
    /// running hash grows large enough, which happens for long inputs.
    func then(result: Int, next: UInt32) -> Int {
        switch self {
        case .djb2:
            return (result << 5) &+ result &+ Int(next)
        case .sdbm:
            return Int(next) &+ (result << 6) &+ (result << 16) &- result
        }
    }
}

View file

@ -0,0 +1,39 @@
@testable import CodableBloomFilter
import XCTest
final class CodableBloomFilterTests: XCTestCase {
    /// Inserted members are reported as present, other strings are not.
    func testContains() {
        var filter = BloomFilter(hashes: [.djb2, .sdbm], bitCount: 1024)

        filter.insert("lol")
        filter.insert("ok")

        XCTAssertTrue(filter.contains("lol"))
        XCTAssertTrue(filter.contains("ok"))
        XCTAssertFalse(filter.contains("wtf"))
        XCTAssertFalse(filter.contains("no"))
    }

    /// The filter encodes to a known JSON document and keeps answering the
    /// same membership queries after being decoded from it.
    func testCoding() throws {
        var filter = BloomFilter(hashes: [.djb2, .sdbm], bitCount: 64)
        let expectedJSON = #"{"bitCount":64,"data":"ABAAAAACAJA=","hashes":["djb2","sdbm"]}"#

        filter.insert("lol")
        filter.insert("ok")

        // Sorted keys make the encoded bytes comparable to the fixture.
        let jsonEncoder = JSONEncoder()
        jsonEncoder.outputFormatting = .sortedKeys
        let encoded = try jsonEncoder.encode(filter)

        XCTAssertEqual(encoded, Data(expectedJSON.utf8))

        let roundTripped = try JSONDecoder().decode(BloomFilter.self, from: encoded)

        XCTAssertTrue(roundTripped.contains("lol"))
        XCTAssertTrue(roundTripped.contains("ok"))
        XCTAssertFalse(roundTripped.contains("wtf"))
        XCTAssertFalse(roundTripped.contains("no"))
    }
}

View file

@ -86,7 +86,6 @@
D047FA8C24C3E21200AF17C5 /* Metatext.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Metatext.app; sourceTree = BUILT_PRODUCTS_DIR; };
D0666A2124C677B400F3F04B /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
D0666A2524C677B400F3F04B /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
D07E164425037264008B10D0 /* SerializableBloomFilter */ = {isa = PBXFileReference; lastKnownFileType = folder; path = SerializableBloomFilter; sourceTree = "<group>"; };
D085C3BB25008DEC008A6C5E /* DB */ = {isa = PBXFileReference; lastKnownFileType = folder; path = DB; sourceTree = "<group>"; };
D0BDF66524FD7A6400C7FA1C /* ServiceLayer */ = {isa = PBXFileReference; lastKnownFileType = folder; path = ServiceLayer; sourceTree = "<group>"; };
D0BEB1F224F8EE8C001B0F04 /* AttachmentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AttachmentView.swift; sourceTree = "<group>"; };
@ -122,6 +121,7 @@
D0C7D46E24F76169001EBDBB /* KingfisherOptionsInfo+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "KingfisherOptionsInfo+Extensions.swift"; sourceTree = "<group>"; };
D0C7D46F24F76169001EBDBB /* View+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "View+Extensions.swift"; sourceTree = "<group>"; };
D0C7D47124F76169001EBDBB /* Data+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+Extensions.swift"; sourceTree = "<group>"; };
D0D7C013250440610039AD6F /* CodableBloomFilter */ = {isa = PBXFileReference; lastKnownFileType = folder; path = CodableBloomFilter; sourceTree = "<group>"; };
D0E0F1E424FC49FC002C04BF /* Mastodon */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Mastodon; sourceTree = "<group>"; };
D0E2C1CF24FD8BA400854680 /* ViewModels */ = {isa = PBXFileReference; lastKnownFileType = folder; path = ViewModels; sourceTree = "<group>"; };
D0E5361924E3EB4D00FB1CE1 /* Notification Service Extension.appex */ = {isa = PBXFileReference; explicitFileType = "wrapper.app-extension"; includeInIndex = 0; path = "Notification Service Extension.appex"; sourceTree = BUILT_PRODUCTS_DIR; };
@ -182,8 +182,8 @@
D047FA7F24C3E21000AF17C5 = {
isa = PBXGroup;
children = (
D07E164425037264008B10D0 /* SerializableBloomFilter */,
D0C7D45224F76169001EBDBB /* Assets.xcassets */,
D0D7C013250440610039AD6F /* CodableBloomFilter */,
D085C3BB25008DEC008A6C5E /* DB */,
D0C7D46824F76169001EBDBB /* Extensions */,
D0666A7924C7745A00F3F04B /* Frameworks */,

View file

@ -1,25 +0,0 @@
// swift-tools-version:5.3
import PackageDescription
// Manifest for the SerializableBloomFilter package: one library product backed
// by a single source target, plus a test target that depends on it.
let moduleName = "SerializableBloomFilter"

let package = Package(
    name: moduleName,
    platforms: [.iOS(.v14), .macOS(.v11)],
    products: [
        .library(name: moduleName, targets: [moduleName])
    ],
    dependencies: [],
    targets: [
        .target(name: moduleName, dependencies: []),
        .testTarget(
            name: moduleName + "Tests",
            dependencies: [.byName(name: moduleName)])
    ]
)

View file

@ -1,56 +0,0 @@
// Copyright © 2020 Metabolist. All rights reserved.
import Foundation
// https://en.wikipedia.org/wiki/Bloom_filter
// https://khanlou.com/2018/09/bloom-filters/
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
/// A Bloom filter over strings with a fixed bit count that can be rebuilt from
/// previously serialized bytes.
struct SerializableBloomFilter {
    /// Backing bit storage.
    private var items: Bits

    /// Creates a filter with fresh storage of `itemCount` bits.
    init() {
        self.items = .init(count: Self.itemCount)
    }

    /// Rebuilds a filter from bytes previously obtained from `serialization`.
    init(serialization: Data) throws {
        let bytes = [UInt8](serialization)
        self.items = .init(bytes: bytes, count: Self.itemCount)
    }
}
extension SerializableBloomFilter {
    /// The raw data of the filter's bits, suitable for `init(serialization:)`.
    var serialization: Data {
        items.data
    }

    /// Sets every bit index derived from `newMember`.
    mutating func insert(_ newMember: String) {
        Self.indices(newMember).forEach { items[$0] = true }
    }

    /// Returns `true` when every bit index derived from `member` is set.
    func contains(_ member: String) -> Bool {
        Self.indices(member).allSatisfy { items[$0] }
    }
}
private extension SerializableBloomFilter {
    /// Number of bits in every filter.
    static let itemCount = 1024
    /// Hash functions applied to every member; each contributes one bit index.
    static let hashFunctions = [djb2, sdbm]

    /// Maps `string` to one bit index per hash function.
    ///
    /// The hash functions use wrapping arithmetic, so they can return any `Int`,
    /// including `Int.min`, for which `abs(_:)` traps. Working on `magnitude`
    /// gives the same index for every other hash value and a valid index for
    /// `Int.min` as well.
    static func indices(_ string: String) -> [Int] {
        let modulus = itemCount.magnitude
        return hashFunctions.map { Int($0(string).magnitude % modulus) }
    }
}
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
/// djb2 string hash: starts at 5381 and, for each Unicode scalar, computes
/// `hash * 33 + scalar` using wrapping addition.
private func djb2(_ string: String) -> Int {
    var hash = 5381
    for scalar in string.unicodeScalars {
        hash = (hash << 5) &+ hash &+ Int(scalar.value)
    }
    return hash
}
/// sdbm string hash: starts at 0 and, for each Unicode scalar, computes
/// `scalar + (hash << 6) + (hash << 16) - hash` with wrapping arithmetic.
///
/// The subtraction uses `&-` like the additions use `&+`; a trapping `-` would
/// crash once the running hash grows large, which happens for long inputs.
private func sdbm(_ string: String) -> Int {
    string.unicodeScalars.map(\.value).reduce(0) {
        Int($1) &+ ($0 << 6) &+ ($0 << 16) &- $0
    }
}

View file

@ -1,31 +0,0 @@
@testable import SerializableBloomFilter
import XCTest
final class SerializableBloomFilterTests: XCTestCase {
    /// Inserted members are reported as present, other strings are not.
    func testContains() {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        XCTAssert(filter.contains("lol"))
        XCTAssert(filter.contains("ok"))
        XCTAssertFalse(filter.contains("wtf"))
        XCTAssertFalse(filter.contains("no"))
    }

    /// A filter rebuilt from its serialization answers the same queries.
    func testSerialization() throws {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        let serialization = filter.serialization
        let deserializedFilter = try SerializableBloomFilter(serialization: serialization)

        // Every assertion targets the deserialized copy; asserting on the
        // original filter would not exercise the round trip at all.
        XCTAssert(deserializedFilter.contains("lol"))
        XCTAssert(deserializedFilter.contains("ok"))
        XCTAssertFalse(deserializedFilter.contains("wtf"))
        XCTAssertFalse(deserializedFilter.contains("no"))
    }
}