mirror of
https://github.com/Dimillian/IceCubesApp.git
synced 2024-11-26 02:01:02 +00:00
Newer, better, faster HTML to Markdown converter (#610)
* This is a more correct HTML to Markdown implementation. It removes all the hacks and just uses the SwiftSoup parser correctly. It fixes issue #576 and also an unlogged issue with the old implementation that could corrupt URLs containing underscores. * Better <br> / linefeed handling * Cleanup --------- Co-authored-by: Thomas Ricouard <ricouard77@gmail.com>
This commit is contained in:
parent
84d111999e
commit
624d4766fa
4 changed files with 106 additions and 60 deletions
|
@ -9,15 +9,6 @@
|
||||||
"version" : "1.2.0"
|
"version" : "1.2.0"
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
{
|
|
||||||
"identity" : "html2markdown",
|
|
||||||
"kind" : "remoteSourceControl",
|
|
||||||
"location" : "https://gitlab.com/mflint/HTML2Markdown",
|
|
||||||
"state" : {
|
|
||||||
"revision" : "00d7a9744bbd1e7762c587bbd248775e16345a65",
|
|
||||||
"version" : "1.0.0"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{
|
{
|
||||||
"identity" : "keychain-swift",
|
"identity" : "keychain-swift",
|
||||||
"kind" : "remoteSourceControl",
|
"kind" : "remoteSourceControl",
|
||||||
|
|
|
@ -1,6 +1,5 @@
|
||||||
import EmojiText
|
import EmojiText
|
||||||
import Foundation
|
import Foundation
|
||||||
import HTML2Markdown
|
|
||||||
import Models
|
import Models
|
||||||
import SwiftUI
|
import SwiftUI
|
||||||
|
|
||||||
|
|
|
@ -16,14 +16,12 @@ let package = Package(
|
||||||
),
|
),
|
||||||
],
|
],
|
||||||
dependencies: [
|
dependencies: [
|
||||||
.package(url: "https://gitlab.com/mflint/HTML2Markdown", exact: "1.0.0"),
|
|
||||||
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.3"),
|
.package(url: "https://github.com/scinfu/SwiftSoup.git", from: "2.4.3"),
|
||||||
],
|
],
|
||||||
targets: [
|
targets: [
|
||||||
.target(
|
.target(
|
||||||
name: "Models",
|
name: "Models",
|
||||||
dependencies: ["HTML2Markdown",
|
dependencies: ["SwiftSoup"]
|
||||||
"SwiftSoup"]
|
|
||||||
),
|
),
|
||||||
.testTarget(
|
.testTarget(
|
||||||
name: "ModelsTests",
|
name: "ModelsTests",
|
||||||
|
|
|
@ -1,14 +1,14 @@
|
||||||
import Foundation
|
import Foundation
|
||||||
import HTML2Markdown
|
|
||||||
import SwiftSoup
|
import SwiftSoup
|
||||||
import SwiftUI
|
import SwiftUI
|
||||||
|
|
||||||
public struct HTMLString: Decodable, Equatable, Hashable {
|
public struct HTMLString: Decodable, Equatable, Hashable {
|
||||||
public var htmlValue: String
|
public var htmlValue: String = ""
|
||||||
public let asMarkdown: String
|
public var asMarkdown: String = ""
|
||||||
public let asRawText: String
|
public var asRawText: String = ""
|
||||||
public let statusesURLs: [URL]
|
public var statusesURLs = [URL]()
|
||||||
public let asSafeMarkdownAttributedString: AttributedString
|
public var asSafeMarkdownAttributedString: AttributedString = AttributedString()
|
||||||
|
private var regex: NSRegularExpression?
|
||||||
|
|
||||||
public init(from decoder: Decoder) {
|
public init(from decoder: Decoder) {
|
||||||
do {
|
do {
|
||||||
|
@ -19,51 +19,21 @@ public struct HTMLString: Decodable, Equatable, Hashable {
|
||||||
}
|
}
|
||||||
|
|
||||||
// https://daringfireball.net/projects/markdown/syntax
|
// https://daringfireball.net/projects/markdown/syntax
|
||||||
// HTML2Markdown only auto escapes * on the way out
|
// Pre-escape \ ` _ * and [ as these are the only
|
||||||
// so we pre-escape \ ` _ and [ as these are the only
|
// characters the markdown parser used picks up
|
||||||
// other characters the markdown parser used picks up
|
|
||||||
// when it renders to attributed text
|
// when it renders to attributed text
|
||||||
if let regex = try? NSRegularExpression(pattern: "([\\_\\`\\[\\\\])", options: .caseInsensitive) {
|
regex = try? NSRegularExpression(pattern: "([\\_\\*\\`\\[\\\\])", options: .caseInsensitive)
|
||||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "\\\\$1")
|
|
||||||
}
|
|
||||||
|
|
||||||
// match intended mastodon presentation
|
|
||||||
// strip out <span="invisible">blah</span>
|
|
||||||
// append ellipsis to <span="ellipsis">blah</span>
|
|
||||||
if let regex = try? NSRegularExpression(pattern: "(<span class=\"invisible\">.*?</span>)", options: .caseInsensitive) {
|
|
||||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "")
|
|
||||||
}
|
|
||||||
if let regex = try? NSRegularExpression(pattern: "(<span class=\"ellipsis\">(.*?)</span>)", options: .caseInsensitive) {
|
|
||||||
htmlValue = regex.stringByReplacingMatches(in: htmlValue, options: [], range: NSRange(location: 0, length: htmlValue.count), withTemplate: "$2…")
|
|
||||||
}
|
|
||||||
|
|
||||||
|
asMarkdown = ""
|
||||||
do {
|
do {
|
||||||
asMarkdown = try HTMLParser().parse(html: htmlValue)
|
|
||||||
.toMarkdown()
|
|
||||||
.replacingOccurrences(of: ")[", with: ") [")
|
|
||||||
} catch {
|
|
||||||
asMarkdown = htmlValue
|
|
||||||
}
|
|
||||||
|
|
||||||
var statusesURLs: [URL] = []
|
|
||||||
do {
|
|
||||||
let document: Document = try SwiftSoup.parse(htmlValue)
|
let document: Document = try SwiftSoup.parse(htmlValue)
|
||||||
let links: Elements = try document.select("a")
|
handleNode(node: document)
|
||||||
for link in links {
|
|
||||||
let href = try link.attr("href")
|
|
||||||
if let url = URL(string: href),
|
|
||||||
let _ = Int(url.lastPathComponent)
|
|
||||||
{
|
|
||||||
statusesURLs.append(url)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
asRawText = try document.text()
|
asRawText = try document.text()
|
||||||
} catch {
|
} catch {
|
||||||
asRawText = htmlValue
|
asRawText = htmlValue
|
||||||
}
|
}
|
||||||
|
|
||||||
self.statusesURLs = statusesURLs
|
|
||||||
|
|
||||||
do {
|
do {
|
||||||
let options = AttributedString.MarkdownParsingOptions(allowsExtendedAttributes: true,
|
let options = AttributedString.MarkdownParsingOptions(allowsExtendedAttributes: true,
|
||||||
interpretedSyntax: .inlineOnlyPreservingWhitespace)
|
interpretedSyntax: .inlineOnlyPreservingWhitespace)
|
||||||
|
@ -80,4 +50,92 @@ public struct HTMLString: Decodable, Equatable, Hashable {
|
||||||
statusesURLs = []
|
statusesURLs = []
|
||||||
asSafeMarkdownAttributedString = AttributedString(stringLiteral: htmlValue)
|
asSafeMarkdownAttributedString = AttributedString(stringLiteral: htmlValue)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Recursively converts `node` and its descendants to Markdown, appending the
/// output to `asMarkdown` and recording matching links in `statusesURLs`.
///
/// Handling by node kind:
/// - class `invisible`: the node and its children are skipped.
/// - class `ellipsis`: the children are rendered, then "…" is appended.
/// - `<p>`: emits a blank line, except when nothing has been emitted yet.
/// - `<br>`: emits a single line break unless one is already present or is
///   about to be supplied by the following text node.
/// - `<a>`: emits `[children](href)`; a link whose last path component parses
///   as an integer is appended to `statusesURLs`.
/// - text: appended with the characters matched by `regex` backslash-escaped.
///
/// - Parameter node: The SwiftSoup node to convert.
private mutating func handleNode(node: SwiftSoup.Node) {
  if let className = try? node.attr("class") {
    if className == "invisible" {
      // don't display
      return
    }

    if className == "ellipsis" {
      // descend into this one now and append the ellipsis
      for child in node.getChildNodes() {
        handleNode(node: child)
      }
      asMarkdown += "…"
      return
    }
  }

  let name = node.nodeName()

  if name == "p" {
    // ignore first opening <p>
    if !asMarkdown.isEmpty {
      asMarkdown += "\n\n"
    }
  } else if name == "br" {
    // Try to stop double carriage returns where they aren't required:
    // skip the break when nothing has been emitted yet, when the output
    // already ends with a line break, or when there is no following sibling.
    // Not perfect but effective in almost all cases.
    if !asMarkdown.isEmpty,
       !asMarkdown.hasSuffix("\n"),
       !asMarkdown.hasSuffix("\u{2028}"),
       let next = node.nextSibling()
    {
      // A following text node that itself starts with a line break will
      // supply the break, so don't add a second one.
      let nextStartsWithBreak = next.nodeName() == "#text"
        && (next.description.hasPrefix("\n") || next.description.hasPrefix("\u{2028}"))
      if !nextStartsWithBreak {
        asMarkdown += "\n"
      }
    }
  } else if name == "a" {
    // If the attribute lookup throws, the link (and its children) is skipped.
    guard let href = try? node.attr("href") else { return }

    if href != "",
       let url = URL(string: href),
       Int(url.lastPathComponent) != nil
    {
      statusesURLs.append(url)
    }

    asMarkdown += "["
    // descend into this node now so we can wrap the
    // inner part of the link in the right markup
    for child in node.getChildNodes() {
      handleNode(node: child)
    }
    asMarkdown += "]("
    asMarkdown += href
    asMarkdown += ")"
    return
  } else if name == "#text" {
    var txt = node.description

    if let regex {
      // This is the markdown escaper.
      // NSRegularExpression works in UTF-16 code units, so the range must be
      // derived from the string's indices rather than from `txt.count`
      // (grapheme clusters); otherwise text containing emoji or combining
      // characters is only partially escaped.
      let fullRange = NSRange(txt.startIndex..., in: txt)
      txt = regex.stringByReplacingMatches(in: txt, options: [], range: fullRange, withTemplate: "\\\\$1")
    }

    asMarkdown += txt
  }

  for child in node.getChildNodes() {
    handleNode(node: child)
  }
}
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue