mirror of
https://github.com/metabolist/metatext.git
synced 2025-01-21 18:48:06 +00:00
Make bloom filter codable
This commit is contained in:
parent
fd71f48a59
commit
321cea3ccd
9 changed files with 154 additions and 114 deletions
21
CodableBloomFilter/Package.swift
Normal file
21
CodableBloomFilter/Package.swift
Normal file
|
@ -0,0 +1,21 @@
|
|||
// swift-tools-version:5.3
|
||||
|
||||
import PackageDescription
|
||||
|
||||
// Manifest for the CodableBloomFilter package: one library product backed by a
// single source target, plus a test target that depends on it. The package
// declares no external dependencies.
let package = Package(
    name: "CodableBloomFilter",
    products: [
        .library(name: "CodableBloomFilter", targets: ["CodableBloomFilter"])
    ],
    dependencies: [],
    targets: [
        .target(name: "CodableBloomFilter", dependencies: []),
        .testTarget(name: "CodableBloomFilterTests", dependencies: ["CodableBloomFilter"])
    ]
)
|
|
@ -0,0 +1,92 @@
|
|||
// Copyright © 2020 Metabolist. All rights reserved.
|
||||
|
||||
import Foundation
|
||||
|
||||
// https://en.wikipedia.org/wiki/Bloom_filter
|
||||
// https://khanlou.com/2018/09/bloom-filters/
|
||||
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
|
||||
|
||||
/// A Bloom filter over strings, configured with a list of named hash functions
/// and a bit count.
struct BloomFilter {
    /// The hash functions applied to each member; each one yields one bit index.
    let hashes: [Hash]
    /// The number of bits the backing storage is created with.
    let bitCount: Int

    /// Backing bit storage.
    private var bits: Bits

    /// Creates a filter backed by a fresh `Bits(count: bitCount)`.
    ///
    /// - Parameters:
    ///   - hashes: The hash functions to apply to each member.
    ///   - bitCount: The number of bits to allocate.
    init(hashes: [Hash], bitCount: Int) {
        self.bits = Bits(count: bitCount)
        self.bitCount = bitCount
        self.hashes = hashes
    }
}
|
||||
|
||||
extension BloomFilter {
    /// The string hash functions a filter can be configured with.
    /// The case name doubles as the raw value used when the filter is encoded.
    enum Hash: String, Codable {
        case djb2, sdbm
    }

    /// Sets the bit at every index derived from `newMember`.
    ///
    /// - Parameter newMember: The string to add to the filter.
    mutating func insert(_ newMember: String) {
        indices(newMember).forEach { bits[$0] = true }
    }

    /// Reports whether every bit derived from `member` is set.
    ///
    /// - Parameter member: The string to look up.
    /// - Returns: `false` if any derived bit is unset, `true` otherwise.
    func contains(_ member: String) -> Bool {
        let flags = indices(member).map { bits[$0] }

        return !flags.contains(false)
    }
}
|
||||
|
||||
extension BloomFilter: Codable {
    /// Keys of the encoded representation. The bit storage is written under `data`.
    enum CodingKeys: String, CodingKey {
        case hashes
        case bitCount
        case data
    }

    /// Decodes a filter from its `hashes`, `bitCount` and `data` keys.
    ///
    /// - Throws: Any error thrown while reading the keyed container, or
    ///   `DecodingError.dataCorrupted` when the decoded `bitCount` is not positive.
    init(from decoder: Decoder) throws {
        let container = try decoder.container(keyedBy: CodingKeys.self)
        let data = try container.decode(Data.self, forKey: .data)
        let decodedHashes = try container.decode([Hash].self, forKey: .hashes)
        let decodedBitCount = try container.decode(Int.self, forKey: .bitCount)

        // Bit indices are computed modulo `bitCount`, so a zero value would trap with a
        // division by zero on first use. Reject non-positive counts while decoding instead.
        guard decodedBitCount > 0 else {
            throw DecodingError.dataCorruptedError(
                forKey: .bitCount,
                in: container,
                debugDescription: "bitCount must be positive, got \(decodedBitCount)")
        }

        hashes = decodedHashes
        bitCount = decodedBitCount
        bits = Bits(bytes: Array(data), count: decodedBitCount)
    }

    /// Encodes the filter as `hashes`, `bitCount` and the bit storage's `data`.
    ///
    /// - Throws: Any error thrown by the encoder's keyed container.
    func encode(to encoder: Encoder) throws {
        var container = encoder.container(keyedBy: CodingKeys.self)

        try container.encode(hashes, forKey: .hashes)
        try container.encode(bitCount, forKey: .bitCount)
        try container.encode(bits.data, forKey: .data)
    }
}
|
||||
|
||||
private extension BloomFilter {
    /// Maps `string` to one bit index per configured hash function.
    ///
    /// - Parameter string: The member being inserted or looked up.
    /// - Returns: The hash values reduced modulo `bitCount`, in `hashes` order.
    func indices(_ string: String) -> [Int] {
        // The hashes wrap on overflow and can therefore yield `Int.min`, for which `abs`
        // traps. `magnitude` gives the same value as `abs` for every other input and is
        // defined for `Int.min` too, so existing indices are unchanged.
        hashes.map { Int($0.apply(string).magnitude % bitCount.magnitude) }
    }
}
|
||||
|
||||
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
|
||||
|
||||
private extension BloomFilter.Hash {
    /// Hashes `string` by folding its Unicode scalar values with this hash's step
    /// function, starting from `initial`.
    ///
    /// - Parameter string: The string to hash.
    /// - Returns: The hash value; arithmetic wraps on overflow, so it may be negative.
    func apply(_ string: String) -> Int {
        string.unicodeScalars.map(\.value).reduce(initial, then)
    }

    /// The seed value the fold starts from.
    var initial: Int {
        switch self {
        case .djb2: return 5381
        case .sdbm: return 0
        }
    }

    /// Combines the running hash with the next Unicode scalar value.
    ///
    /// - Parameters:
    ///   - result: The hash accumulated so far.
    ///   - next: The next Unicode scalar value of the input.
    /// - Returns: The updated hash.
    func then(result: Int, next: UInt32) -> Int {
        switch self {
        case .djb2:
            // result * 33 + next, wrapping on overflow.
            return (result << 5) &+ result &+ Int(next)
        case .sdbm:
            // next + result * 65599, wrapping on overflow. The final subtraction must
            // wrap as well (`&-`); a plain `-` traps once the hash overflows.
            return Int(next) &+ (result << 6) &+ (result << 16) &- result
        }
    }
}
|
|
@ -0,0 +1,39 @@
|
|||
@testable import CodableBloomFilter
|
||||
import XCTest
|
||||
|
||||
final class CodableBloomFilterTests: XCTestCase {
    /// Inserted members are reported as contained; members never inserted are not.
    func testContains() {
        var sut = BloomFilter(hashes: [.djb2, .sdbm], bitCount: 1024)

        for member in ["lol", "ok"] {
            sut.insert(member)
        }

        for member in ["lol", "ok"] {
            XCTAssert(sut.contains(member))
        }

        for absent in ["wtf", "no"] {
            XCTAssertFalse(sut.contains(absent))
        }
    }

    /// Encoding produces the expected sorted-key JSON, and decoding that JSON yields a
    /// filter that answers membership queries the same way.
    func testCoding() throws {
        let expectedSerialization = Data(#"{"bitCount":64,"data":"ABAAAAACAJA=","hashes":["djb2","sdbm"]}"#.utf8)
        var sut = BloomFilter(hashes: [.djb2, .sdbm], bitCount: 64)

        for member in ["lol", "ok"] {
            sut.insert(member)
        }

        // Sorted keys keep the JSON byte-for-byte comparable.
        let encoder = JSONEncoder()
        encoder.outputFormatting = .sortedKeys

        let serialization = try encoder.encode(sut)

        XCTAssertEqual(serialization, expectedSerialization)

        let decoded = try JSONDecoder().decode(BloomFilter.self, from: serialization)

        for member in ["lol", "ok"] {
            XCTAssert(decoded.contains(member))
        }

        for absent in ["wtf", "no"] {
            XCTAssertFalse(decoded.contains(absent))
        }
    }
}
|
|
@ -86,7 +86,6 @@
|
|||
D047FA8C24C3E21200AF17C5 /* Metatext.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = Metatext.app; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
D0666A2124C677B400F3F04B /* Tests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex = 0; path = Tests.xctest; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
D0666A2524C677B400F3F04B /* Info.plist */ = {isa = PBXFileReference; lastKnownFileType = text.plist.xml; path = Info.plist; sourceTree = "<group>"; };
|
||||
D07E164425037264008B10D0 /* SerializableBloomFilter */ = {isa = PBXFileReference; lastKnownFileType = folder; path = SerializableBloomFilter; sourceTree = "<group>"; };
|
||||
D085C3BB25008DEC008A6C5E /* DB */ = {isa = PBXFileReference; lastKnownFileType = folder; path = DB; sourceTree = "<group>"; };
|
||||
D0BDF66524FD7A6400C7FA1C /* ServiceLayer */ = {isa = PBXFileReference; lastKnownFileType = folder; path = ServiceLayer; sourceTree = "<group>"; };
|
||||
D0BEB1F224F8EE8C001B0F04 /* AttachmentView.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = AttachmentView.swift; sourceTree = "<group>"; };
|
||||
|
@ -122,6 +121,7 @@
|
|||
D0C7D46E24F76169001EBDBB /* KingfisherOptionsInfo+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "KingfisherOptionsInfo+Extensions.swift"; sourceTree = "<group>"; };
|
||||
D0C7D46F24F76169001EBDBB /* View+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "View+Extensions.swift"; sourceTree = "<group>"; };
|
||||
D0C7D47124F76169001EBDBB /* Data+Extensions.swift */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.swift; path = "Data+Extensions.swift"; sourceTree = "<group>"; };
|
||||
D0D7C013250440610039AD6F /* CodableBloomFilter */ = {isa = PBXFileReference; lastKnownFileType = folder; path = CodableBloomFilter; sourceTree = "<group>"; };
|
||||
D0E0F1E424FC49FC002C04BF /* Mastodon */ = {isa = PBXFileReference; lastKnownFileType = folder; path = Mastodon; sourceTree = "<group>"; };
|
||||
D0E2C1CF24FD8BA400854680 /* ViewModels */ = {isa = PBXFileReference; lastKnownFileType = folder; path = ViewModels; sourceTree = "<group>"; };
|
||||
D0E5361924E3EB4D00FB1CE1 /* Notification Service Extension.appex */ = {isa = PBXFileReference; explicitFileType = "wrapper.app-extension"; includeInIndex = 0; path = "Notification Service Extension.appex"; sourceTree = BUILT_PRODUCTS_DIR; };
|
||||
|
@ -182,8 +182,8 @@
|
|||
D047FA7F24C3E21000AF17C5 = {
|
||||
isa = PBXGroup;
|
||||
children = (
|
||||
D07E164425037264008B10D0 /* SerializableBloomFilter */,
|
||||
D0C7D45224F76169001EBDBB /* Assets.xcassets */,
|
||||
D0D7C013250440610039AD6F /* CodableBloomFilter */,
|
||||
D085C3BB25008DEC008A6C5E /* DB */,
|
||||
D0C7D46824F76169001EBDBB /* Extensions */,
|
||||
D0666A7924C7745A00F3F04B /* Frameworks */,
|
||||
|
|
|
@ -1,25 +0,0 @@
|
|||
// swift-tools-version:5.3
|
||||
|
||||
import PackageDescription
|
||||
|
||||
// Manifest for the SerializableBloomFilter package: one library product backed by a
// single source target, plus a test target that depends on it. Targets iOS 14 and
// macOS 11 and declares no external dependencies.
let package = Package(
    name: "SerializableBloomFilter",
    platforms: [.iOS(.v14), .macOS(.v11)],
    products: [
        .library(name: "SerializableBloomFilter", targets: ["SerializableBloomFilter"])
    ],
    dependencies: [],
    targets: [
        .target(name: "SerializableBloomFilter", dependencies: []),
        .testTarget(name: "SerializableBloomFilterTests", dependencies: ["SerializableBloomFilter"])
    ]
)
|
|
@ -1,56 +0,0 @@
|
|||
// Copyright © 2020 Metabolist. All rights reserved.
|
||||
|
||||
import Foundation
|
||||
|
||||
// https://en.wikipedia.org/wiki/Bloom_filter
|
||||
// https://khanlou.com/2018/09/bloom-filters/
|
||||
// This implementation uses deterministic hashing functions so it can be serialized / deserialized
|
||||
|
||||
/// A Bloom filter over strings with a fixed bit count (`Self.itemCount`) whose bit
/// storage can be exported as `Data` and restored from it.
struct SerializableBloomFilter {
    /// Backing bit storage.
    private var items: Bits

    /// Creates a filter backed by a fresh `Bits(count: Self.itemCount)`.
    init() {
        self.items = Bits(count: Self.itemCount)
    }

    /// Restores a filter from bytes previously obtained from `serialization`.
    ///
    /// - Parameter serialization: The bytes to load into the bit storage.
    init(serialization: Data) throws {
        let bytes = [UInt8](serialization)

        self.items = Bits(bytes: bytes, count: Self.itemCount)
    }
}
|
||||
|
||||
extension SerializableBloomFilter {
    /// The bit storage's `data`, suitable for passing to `init(serialization:)`.
    var serialization: Data {
        return items.data
    }

    /// Sets the bit at every index derived from `newMember`.
    ///
    /// - Parameter newMember: The string to add to the filter.
    mutating func insert(_ newMember: String) {
        Self.indices(newMember).forEach { items[$0] = true }
    }

    /// Reports whether every bit derived from `member` is set.
    ///
    /// - Parameter member: The string to look up.
    /// - Returns: `false` if any derived bit is unset, `true` otherwise.
    func contains(_ member: String) -> Bool {
        let flags = Self.indices(member).map { items[$0] }

        return !flags.contains(false)
    }
}
|
||||
|
||||
private extension SerializableBloomFilter {
    /// Number of bits in the filter.
    static let itemCount = 1024
    /// Hash functions applied to every member; each one yields one bit index.
    static let hashFunctions = [djb2, sdbm]

    /// Maps `string` to one bit index per hash function.
    ///
    /// - Parameter string: The member being inserted or looked up.
    /// - Returns: The hash values reduced modulo `itemCount`, in `hashFunctions` order.
    static func indices(_ string: String) -> [Int] {
        // The hashes wrap on overflow and can therefore yield `Int.min`, for which `abs`
        // traps. `magnitude` gives the same value as `abs` for every other input and is
        // defined for `Int.min` too, so existing indices are unchanged.
        hashFunctions.map { Int($0(string).magnitude % UInt(itemCount)) }
    }
}
|
||||
|
||||
// https://gist.github.com/kharrison/2355182ac03b481921073c5cf6d77a73
|
||||
|
||||
/// djb2 string hash over the Unicode scalar values of `string`.
///
/// Starts at 5381 and computes `hash * 33 + scalar` for each scalar, wrapping on overflow.
///
/// - Parameter string: The string to hash.
/// - Returns: The hash value; may be negative after wrapping.
private func djb2(_ string: String) -> Int {
    var hash = 5381

    for scalar in string.unicodeScalars {
        hash = (hash << 5) &+ hash &+ Int(scalar.value)
    }

    return hash
}
|
||||
|
||||
/// sdbm string hash over the Unicode scalar values of `string`.
///
/// Starts at 0 and computes `scalar + hash * 65599` for each scalar, wrapping on overflow.
///
/// - Parameter string: The string to hash.
/// - Returns: The hash value; may be negative after wrapping.
private func sdbm(_ string: String) -> Int {
    string.unicodeScalars.map(\.value).reduce(0) {
        // The final subtraction must wrap like the additions (`&-`); a plain `-`
        // traps once the running hash overflows, which long inputs can trigger.
        Int($1) &+ ($0 << 6) &+ ($0 << 16) &- $0
    }
}
|
|
@ -1,31 +0,0 @@
|
|||
@testable import SerializableBloomFilter
|
||||
import XCTest
|
||||
|
||||
final class SerializableBloomFilterTests: XCTestCase {
    /// Inserted members are reported as contained; members never inserted are not.
    func testContains() {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        XCTAssert(filter.contains("lol"))
        XCTAssert(filter.contains("ok"))
        XCTAssertFalse(filter.contains("wtf"))
        XCTAssertFalse(filter.contains("no"))
    }

    /// A filter restored from its serialization answers membership queries the same
    /// way as the filter it was created from.
    func testSerialization() throws {
        var filter = SerializableBloomFilter()

        filter.insert("lol")
        filter.insert("ok")

        let serialization = filter.serialization
        let deserializedFilter = try SerializableBloomFilter(serialization: serialization)

        // Every assertion targets the deserialized filter; checking the original
        // `filter` here would not exercise the round-trip.
        XCTAssert(deserializedFilter.contains("lol"))
        XCTAssert(deserializedFilter.contains("ok"))
        XCTAssertFalse(deserializedFilter.contains("wtf"))
        XCTAssertFalse(deserializedFilter.contains("no"))
    }
}
|
Loading…
Reference in a new issue