mirror of
https://github.com/Dimillian/IceCubesApp.git
synced 2024-11-25 17:51:01 +00:00
Newer, better, faster HTML to Markdown converter (#610)
* This is a more correct HTML to Markdown implementation. It removes all the hacks and just uses the SwiftSoup parser correctly. It will fix issue #576 and also an unlogged issue with the old implementation that could corrupt URLs with underscores in them. * Better <br> / linefeed handling * Cleanup --------- Co-authored-by: Thomas Ricouard <ricouard77@gmail.com>
This commit is contained in:
parent
84d111999e
commit
624d4766fa
4 changed files with 106 additions and 60 deletions
|
@ -9,15 +9,6 @@
|
|||
"version" : "1.2.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "html2markdown",
|
||||
"kind" : "remoteSourceControl",
|
||||
"location" : "https://gitlab.com/mflint/HTML2Markdown",
|
||||
"state" : {
|
||||
"revision" : "00d7a9744bbd1e7762c587bbd248775e16345a65",
|
||||
"version" : "1.0.0"
|
||||
}
|
||||
},
|
||||
{
|
||||
"identity" : "keychain-swift",
|
||||
"kind" : "remoteSourceControl",
|
||||
|
|
|
@ -1,6 +1,5 @@
|
|||
import EmojiText
|
||||
import Foundation
|
||||
import HTML2Markdown
|
||||
import Models
|
||||
import SwiftUI
|
||||
|
||||
|
|
|
@ -16,14 +16,12 @@ let package = Package(
|
|||
),
|
||||
],
|
||||
dependencies: [
|
||||
.package(url: "https://gitlab.com/mflint/HTML2Markdown", exact: "1.0.0"),
|
||||
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.3"),
|
||||
],
|
||||
targets: [
|
||||
.target(
|
||||
name: "Models",
|
||||
dependencies: ["HTML2Markdown",
|
||||
"SwiftSoup"]
|
||||
dependencies: ["SwiftSoup"]
|
||||
),
|
||||
.testTarget(
|
||||
name: "ModelsTests",
|
||||
|
|
|
@ -1,14 +1,14 @@
|
|||
import Foundation
|
||||
import HTML2Markdown
|
||||
import SwiftSoup
|
||||
import SwiftUI
|
||||
|
||||
public struct HTMLString: Decodable, Equatable, Hashable {
|
||||
public var htmlValue: String
|
||||
public let asMarkdown: String
|
||||
public let asRawText: String
|
||||
public let statusesURLs: [URL]
|
||||
public let asSafeMarkdownAttributedString: AttributedString
|
||||
public var htmlValue: String = ""
|
||||
public var asMarkdown: String = ""
|
||||
public var asRawText: String = ""
|
||||
public var statusesURLs = [URL]()
|
||||
public var asSafeMarkdownAttributedString: AttributedString = AttributedString()
|
||||
private var regex: NSRegularExpression?
|
||||
|
||||
public init(from decoder: Decoder) {
|
||||
do {
|
||||
|
@ -19,51 +19,21 @@ public struct HTMLString: Decodable, Equatable, Hashable {
|
|||
}
|
||||
|
||||
// https://daringfireball.net/projects/markdown/syntax
|
||||
// HTML2Markdown only auto escapes * on the way out
|
||||
// so we pre-escape \ ` _ and [ as these are the only
|
||||
// other characters the markdown parser used picks up
|
||||
// Pre-escape \ ` _ * and [ as these are the only
|
||||
// characters the markdown parser used picks up
|
||||
// when it renders to attributed text
|
||||
if let regex = try? NSRegularExpression(pattern: "([\\_\\`\\[\\\\])", options: .caseInsensitive) {
|
||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "\\\\$1")
|
||||
}
|
||||
|
||||
// match intended mastodon presentation
|
||||
// strip out <span class="invisible">blah</span>
|
||||
// append ellipsis to <span class="ellipsis">blah</span>
|
||||
if let regex = try? NSRegularExpression(pattern: "(<span class=\"invisible\">.*?</span>)", options: .caseInsensitive) {
|
||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "")
|
||||
}
|
||||
if let regex = try? NSRegularExpression(pattern: "(<span class=\"ellipsis\">(.*?)</span>)", options: .caseInsensitive) {
|
||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "$2…")
|
||||
}
|
||||
regex = try? NSRegularExpression(pattern: "([\\_\\*\\`\\[\\\\])", options: .caseInsensitive)
|
||||
|
||||
asMarkdown = ""
|
||||
do {
|
||||
asMarkdown = try HTMLParser().parse(html: htmlValue)
|
||||
.toMarkdown()
|
||||
.replacingOccurrences(of: ")[", with: ") [")
|
||||
} catch {
|
||||
asMarkdown = htmlValue
|
||||
}
|
||||
|
||||
var statusesURLs: [URL] = []
|
||||
do {
|
||||
let document: Document = try SwiftSoup.parse(htmlValue)
|
||||
let links: Elements = try document.select("a")
|
||||
for link in links {
|
||||
let href = try link.attr("href")
|
||||
if let url = URL(string: href),
|
||||
let _ = Int(url.lastPathComponent)
|
||||
{
|
||||
statusesURLs.append(url)
|
||||
}
|
||||
}
|
||||
handleNode(node: document)
|
||||
asRawText = try document.text()
|
||||
} catch {
|
||||
asRawText = htmlValue
|
||||
}
|
||||
|
||||
self.statusesURLs = statusesURLs
|
||||
|
||||
do {
|
||||
let options = AttributedString.MarkdownParsingOptions(allowsExtendedAttributes: true,
|
||||
interpretedSyntax: .inlineOnlyPreservingWhitespace)
|
||||
|
@ -80,4 +50,92 @@ public struct HTMLString: Decodable, Equatable, Hashable {
|
|||
statusesURLs = []
|
||||
asSafeMarkdownAttributedString = AttributedString(stringLiteral: htmlValue)
|
||||
}
|
||||
|
||||
/// Recursively walks a parsed HTML node and appends its Markdown rendering to `asMarkdown`.
///
/// Handling per node:
/// - class `invisible`: the node and its children are skipped.
/// - class `ellipsis`: the children are rendered, then "…" is appended.
/// - `<p>`: appends a blank line, unless nothing has been emitted yet.
/// - `<br>`: appends a single line break, unless one is already present or about to follow.
/// - `<a>`: rendered as `[children](href)`; an href whose last path component is an
///   integer is also recorded in `statusesURLs`.
/// - text: appended after escaping the characters matched by `regex`.
///
/// Any error thrown while reading attributes abandons the current node silently.
///
/// - Parameter node: The node to render; its children are visited depth-first.
private mutating func handleNode(node: SwiftSoup.Node ) {
  do {
    if let className = try? node.attr("class") {
      if className == "invisible" {
        // don't display
        return
      }

      if className == "ellipsis" {
        // descend into this one now and append the ellipsis
        for child in node.getChildNodes() {
          handleNode(node: child)
        }
        asMarkdown += "…"
        return
      }
    }

    switch node.nodeName() {
    case "p":
      // ignore the first opening <p>
      if !asMarkdown.isEmpty {
        asMarkdown += "\n\n"
      }

    case "br":
      // Try to stop double carriage returns where they aren't required;
      // not perfect but effective in almost all cases.
      let endsWithBreak = asMarkdown.hasSuffix("\n") || asMarkdown.hasSuffix("\u{2028}")
      // ignore a leading <br>, and emit nothing when there is no following sibling
      if !asMarkdown.isEmpty, !endsWithBreak, let next = node.nextSibling() {
        let nextStartsWithBreak = next.nodeName() == "#text"
          && (next.description.hasPrefix("\n") || next.description.hasPrefix("\u{2028}"))
        if !nextStartsWithBreak {
          asMarkdown += "\n"
        }
      }

    case "a":
      let href = try node.attr("href")
      if !href.isEmpty,
         let url = URL(string: href),
         Int(url.lastPathComponent) != nil
      {
        statusesURLs.append(url)
      }
      asMarkdown += "["
      // descend into this node now so we can wrap the
      // inner part of the link in the right markup
      for child in node.getChildNodes() {
        handleNode(node: child)
      }
      asMarkdown += "]("
      asMarkdown += href
      asMarkdown += ")"
      return

    case "#text":
      var text = node.description
      if let regex {
        // This is the markdown escaper.
        // NSRange is measured in UTF-16 code units, so build it from the string's
        // indices; `text.count` (grapheme clusters) would cut the range short for
        // emoji and other multi-unit characters and leave the tail unescaped.
        let fullRange = NSRange(text.startIndex..., in: text)
        text = regex.stringByReplacingMatches(in: text, options: [], range: fullRange, withTemplate: "\\\\$1")
      }
      asMarkdown += text

    default:
      break
    }

    for child in node.getChildNodes() {
      handleNode(node: child)
    }
  } catch {
    // Best effort: a node whose attributes cannot be read is skipped.
  }
}
|
||||
|
||||
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue