Fix parsing HTML with unicode URL links

This commit is contained in:
Justin Mazzocchi 2021-03-28 23:36:25 -07:00
parent 9552305a78
commit b7c52a3dd2
No known key found for this signature in database
GPG key ID: E223E6937AAFB01C
3 changed files with 27 additions and 11 deletions

View file

@ -93,7 +93,7 @@ extension HTMLParser: XMLParserDelegate {
attributes attributeDict: [String: String] = [:]) {
attributesStack.append(attributeDict)
if elementName == "a", let hrefString = attributeDict["href"], let href = URL(string: hrefString) {
if elementName == "a", let hrefString = attributeDict["href"], let href = URL(unicodeString: hrefString) {
currentLink = Link(href: href, location: constructedString.utf16.count)
} else if elementName == "br" {
constructedString.append("\n")
@ -113,6 +113,7 @@ extension HTMLParser: XMLParserDelegate {
if elementName == "a", var link = currentLink {
link.length = constructedString.utf16.count - link.location
links.insert(link)
currentLink = nil
} else if elementName == "p", parser.columnNumber < parseStopColumn {
constructedString.append("\n\n")
}

View file

@ -13,16 +13,7 @@ extension UnicodeURL: Codable {
raw = try container.decode(String.self)
if let url = URL(string: raw) {
self.url = url
} else if let escaped = raw.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) {
let colonUnescaped = escaped.replacingOccurrences(
of: "%3A",
with: ":",
range: escaped.range(of: "%3A"))
guard let url = URL(string: colonUnescaped) else { throw URLError(.badURL) }
if let url = URL(unicodeString: raw) {
self.url = url
} else {
throw URLError(.badURL)

View file

@ -0,0 +1,24 @@
// Copyright © 2021 Metabolist. All rights reserved.
import Foundation
extension URL {
    /// Creates a URL from a string that may contain characters `URL(string:)` rejects,
    /// such as unescaped non-ASCII text.
    ///
    /// Three strategies are tried in order, and the first that yields a URL wins:
    /// 1. `URL(string:)` on the string as given.
    /// 2. Percent-encode only the characters that are not part of URL syntax, keeping
    ///    delimiters (`:`, `/`, `?`, `#`, `@`, …) and existing `%XX` escapes intact so the
    ///    scheme, port, query, and fragment keep their meaning.
    /// 3. The legacy fallback: percent-encode everything outside `.urlPathAllowed` and
    ///    restore the first escaped colon. It is kept so that input which strategy 2
    ///    cannot parse (for example a stray `%`) is still accepted as it was before.
    ///
    /// - Parameter unicodeString: The URL text, possibly containing unescaped characters.
    /// - Returns: `nil` when none of the strategies produce a URL.
    init?(unicodeString: String) {
        guard let url = Self.parsed(unicodeString: unicodeString) else { return nil }

        self = url
    }

    /// Characters left untouched by the structure-preserving pass: the RFC 3986
    /// unreserved and reserved sets, plus `%` so existing escapes are not double-encoded.
    private static let urlSyntaxCharacters = CharacterSet(
        charactersIn: "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
            + "-._~"
            + ":/?#[]@"
            + "!$&'()*+,;="
            + "%")

    /// Runs the parsing strategies described on `init?(unicodeString:)`.
    private static func parsed(unicodeString: String) -> URL? {
        if let url = URL(string: unicodeString) {
            return url
        }

        // Escaping with `.urlPathAllowed` alone would turn `?` and `#` into `%3F` / `%23`,
        // folding the query and fragment into the path, so try a pass that only escapes
        // characters with no role in URL syntax first.
        if let escaped = unicodeString.addingPercentEncoding(withAllowedCharacters: urlSyntaxCharacters),
           let url = URL(string: escaped) {
            return url
        }

        guard let pathEscaped = unicodeString.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
            return nil
        }

        // Only the first escaped colon (normally the scheme separator) is restored;
        // when there is none, the nil range makes this a no-op.
        let colonUnescaped = pathEscaped.replacingOccurrences(
            of: "%3A",
            with: ":",
            range: pathEscaped.range(of: "%3A"))

        return URL(string: colonUnescaped)
    }
}