Newer, better, faster HTML to Markdown converter (#610)

* This is a more correct HTML-to-Markdown implementation.

It removes all the hacks and just uses the SwiftSoup parser correctly

It fixes issue #576 and also an unlogged issue with the old implementation that could corrupt URLs containing underscores.

* Better <br> / linefeed handling

* Cleanup

---------

Co-authored-by: Thomas Ricouard <ricouard77@gmail.com>
This commit is contained in:
Gareth Simpson 2023-02-04 07:16:19 +00:00 committed by GitHub
parent 84d111999e
commit 624d4766fa
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
4 changed files with 106 additions and 60 deletions

View file

@ -9,15 +9,6 @@
"version" : "1.2.0"
}
},
{
"identity" : "html2markdown",
"kind" : "remoteSourceControl",
"location" : "https://gitlab.com/mflint/HTML2Markdown",
"state" : {
"revision" : "00d7a9744bbd1e7762c587bbd248775e16345a65",
"version" : "1.0.0"
}
},
{
"identity" : "keychain-swift",
"kind" : "remoteSourceControl",

View file

@ -1,6 +1,5 @@
import EmojiText
import Foundation
import HTML2Markdown
import Models
import SwiftUI

View file

@ -16,14 +16,12 @@ let package = Package(
),
],
dependencies: [
.package(url: "https://gitlab.com/mflint/HTML2Markdown", exact: "1.0.0"),
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.3"),
],
targets: [
.target(
name: "Models",
dependencies: ["HTML2Markdown",
"SwiftSoup"]
dependencies: ["SwiftSoup"]
),
.testTarget(
name: "ModelsTests",

View file

@ -1,14 +1,14 @@
import Foundation
import HTML2Markdown
import SwiftSoup
import SwiftUI
public struct HTMLString: Decodable, Equatable, Hashable {
public var htmlValue: String
public let asMarkdown: String
public let asRawText: String
public let statusesURLs: [URL]
public let asSafeMarkdownAttributedString: AttributedString
public var htmlValue: String = ""
public var asMarkdown: String = ""
public var asRawText: String = ""
public var statusesURLs = [URL]()
public var asSafeMarkdownAttributedString: AttributedString = AttributedString()
private var regex: NSRegularExpression?
public init(from decoder: Decoder) {
do {
@ -19,51 +19,21 @@ public struct HTMLString: Decodable, Equatable, Hashable {
}
// https://daringfireball.net/projects/markdown/syntax
// HTML2Markdown only auto escapes * on the way out
// so we pre-escape \ ` _ and [ as these are the only
// other characters the markdown parser used picks up
// Pre-escape \ ` _ * and [ as these are the only
// characters the markdown parser used picks up
// when it renders to attributed text
if let regex = try? NSRegularExpression(pattern: "([\\_\\`\\[\\\\])", options: .caseInsensitive) {
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "\\\\$1")
}
// match intended mastodon presentation
// strip out <span="invisible">blah</span>
// append ellipsis to <span="ellipsis">blah</span>
if let regex = try? NSRegularExpression(pattern: "(<span class=\"invisible\">.*?</span>)", options: .caseInsensitive) {
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "")
}
if let regex = try? NSRegularExpression(pattern: "(<span class=\"ellipsis\">(.*?)</span>)", options: .caseInsensitive) {
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "$2…")
}
regex = try? NSRegularExpression(pattern: "([\\_\\*\\`\\[\\\\])", options: .caseInsensitive)
asMarkdown = ""
do {
asMarkdown = try HTMLParser().parse(html: htmlValue)
.toMarkdown()
.replacingOccurrences(of: ")[", with: ") [")
} catch {
asMarkdown = htmlValue
}
var statusesURLs: [URL] = []
do {
let document: Document = try SwiftSoup.parse(htmlValue)
let links: Elements = try document.select("a")
for link in links {
let href = try link.attr("href")
if let url = URL(string: href),
let _ = Int(url.lastPathComponent)
{
statusesURLs.append(url)
}
}
handleNode(node: document)
asRawText = try document.text()
} catch {
asRawText = htmlValue
}
self.statusesURLs = statusesURLs
do {
let options = AttributedString.MarkdownParsingOptions(allowsExtendedAttributes: true,
interpretedSyntax: .inlineOnlyPreservingWhitespace)
@ -80,4 +50,92 @@ public struct HTMLString: Decodable, Equatable, Hashable {
statusesURLs = []
asSafeMarkdownAttributedString = AttributedString(stringLiteral: htmlValue)
}
/// Recursively walks a parsed HTML node tree, appending the Markdown rendering
/// of `node` and its descendants to `asMarkdown` and recording links whose
/// last path component is an integer in `statusesURLs`.
///
/// - Parameter node: The node to render. Its children are visited in document order.
private mutating func handleNode(node: SwiftSoup.Node) {
  if let className = try? node.attr("class") {
    if className == "invisible" {
      // Don't display; skip this node and everything below it.
      return
    }

    if className == "ellipsis" {
      // Descend into this node now, then append the ellipsis that marks
      // the text as truncated.
      for child in node.getChildNodes() {
        handleNode(node: child)
      }
      asMarkdown += "…"
      return
    }
  }

  switch node.nodeName() {
  case "p":
    // Ignore the first opening <p>: only separate from preceding output.
    if !asMarkdown.isEmpty {
      asMarkdown += "\n\n"
    }

  case "br":
    // Try to stop double carriage returns where they aren't required:
    // skip the line feed when the output already ends with one, or when the
    // following text node starts with one. Not perfect, but effective in
    // almost all cases. A <br> with no next sibling emits nothing.
    if !asMarkdown.isEmpty, // ignore a leading <br>
       !asMarkdown.hasSuffix("\n"),
       !asMarkdown.hasSuffix("\u{2028}"),
       let next = node.nextSibling()
    {
      let nextStartsWithLineBreak = next.nodeName() == "#text"
        && (next.description.hasPrefix("\n") || next.description.hasPrefix("\u{2028}"))
      if !nextStartsWithLineBreak {
        asMarkdown += "\n"
      }
    }

  case "a":
    // If the attribute lookup fails, give up on this link entirely
    // (nothing is emitted for it or for its children).
    guard let href = try? node.attr("href") else { return }

    if !href.isEmpty,
       let url = URL(string: href),
       Int(url.lastPathComponent) != nil
    {
      statusesURLs.append(url)
    }

    // Descend into this node now so the inner part of the link can be
    // wrapped in the right markup.
    asMarkdown += "["
    for child in node.getChildNodes() {
      handleNode(node: child)
    }
    asMarkdown += "]("
    asMarkdown += href
    asMarkdown += ")"
    return

  case "#text":
    var text = node.description
    if let regex {
      // This is the markdown escaper. NSRegularExpression works in UTF-16
      // code units, so the range must span the whole UTF-16 length of the
      // string; a Character count would fall short after emoji and leave
      // trailing metacharacters unescaped.
      let fullRange = NSRange(text.startIndex..., in: text)
      text = regex.stringByReplacingMatches(in: text, options: [], range: fullRange, withTemplate: "\\\\$1")
    }
    asMarkdown += text

  default:
    break
  }

  for child in node.getChildNodes() {
    handleNode(node: child)
  }
}
}