Support status links with non-ASCII characters (Bugfix 1546) (#1550)

* Allow creation of URL objects from strings containing non-ASCII characters

Adds a new initializer for creating URL objects with a flag to specify that
non-ASCII characters found in the path or query string should first be
URL encoded.

* Add basic test for creating HTMLString objects

* Encode link paths and queries when parsing statuses

It's common to use non-ASCII characters in URLs even though they're technically
invalid characters. Every modern browser handles this by silently encoding
the invalid characters on the user's behalf. However, trying to create a URL
object from a string with unencoded characters results in nil, so we need to
encode the invalid characters before creating the URL object. The unencoded
version should still be shown in the displayed status.

The parsing of the URL string is a little messy because we can't use the URL
class for this scenario and need to duplicate some of its work.

* Only encode link URLs as a backup

If a URL can be created from a status href, don't try URL encoding
it as this could result in double encoding. Only encode the string
if the creation of a URL fails. This is also more efficient.
This commit is contained in:
Grant McSheffrey 2023-08-23 01:08:12 -04:00 committed by GitHub
parent edf36d4b30
commit 30f9da06c8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
3 changed files with 141 additions and 30 deletions

View file

@ -11,7 +11,7 @@ public struct HTMLString: Codable, Equatable, Hashable, @unchecked Sendable {
public var asMarkdown: String = "" public var asMarkdown: String = ""
public var asRawText: String = "" public var asRawText: String = ""
public var statusesURLs = [URL]() public var statusesURLs = [URL]()
public private(set) var links = [Link]() private(set) public var links = [Link]()
public var asSafeMarkdownAttributedString: AttributedString = .init() public var asSafeMarkdownAttributedString: AttributedString = .init()
private var main_regex: NSRegularExpression? private var main_regex: NSRegularExpression?
@ -151,14 +151,25 @@ public struct HTMLString: Codable, Equatable, Hashable, @unchecked Sendable {
handleNode(node: nn) handleNode(node: nn)
} }
let finish = asMarkdown.endIndex let finish = asMarkdown.endIndex
var linkRef = href
// Try creating a URL from the string. If it fails, try URL encoding
// the string first.
var url = URL(string: href)
if url == nil {
url = URL(string: href, encodePath: true)
}
if let linkUrl = url {
linkRef = linkUrl.absoluteString
let displayString = asMarkdown[start..<finish]
links.append(Link(linkUrl, displayString: String(displayString)))
}
asMarkdown += "](" asMarkdown += "]("
asMarkdown += href asMarkdown += linkRef
asMarkdown += ")" asMarkdown += ")"
if let url = URL(string: href) {
let displayString = asMarkdown[start ..< finish]
links.append(Link(url, displayString: String(displayString)))
}
return return
} else if node.nodeName() == "#text" { } else if node.nodeName() == "#text" {
var txt = node.description var txt = node.description
@ -190,19 +201,19 @@ public struct HTMLString: Codable, Equatable, Hashable, @unchecked Sendable {
self.displayString = displayString self.displayString = displayString
switch displayString.first { switch displayString.first {
case "@": case "@":
type = .mention self.type = .mention
title = displayString self.title = displayString
case "#": case "#":
type = .hashtag self.type = .hashtag
title = String(displayString.dropFirst()) self.title = String(displayString.dropFirst())
default: default:
type = .url self.type = .url
var hostNameUrl = url.host ?? url.absoluteString var hostNameUrl = url.host ?? url.absoluteString
if hostNameUrl.hasPrefix("www.") { if hostNameUrl.hasPrefix("www.") {
hostNameUrl = String(hostNameUrl.dropFirst(4)) hostNameUrl = String(hostNameUrl.dropFirst(4))
} }
title = hostNameUrl self.title = hostNameUrl
} }
} }
@ -213,3 +224,45 @@ public struct HTMLString: Codable, Equatable, Hashable, @unchecked Sendable {
} }
} }
} }
extension URL {
  /// Creates a URL from `string`, optionally percent-encoding characters that are
  /// not valid in a URL before parsing it.
  ///
  /// It's common to use non-ASCII characters in URLs even though they're technically
  /// invalid. Browsers silently encode them on the user's behalf, so links copied
  /// from a browser frequently contain them. This initializer performs that
  /// encoding for `http://` and `https://` strings so that such links can still be
  /// turned into `URL` values.
  ///
  /// The string is split into three parts, each encoded with its own allowed set:
  /// - everything between the scheme and the first `?` or `#` (authority and path)
  ///   uses `.urlPathAllowed`;
  /// - the query (from `?` up to `#`) uses `.urlQueryAllowed`;
  /// - the fragment (after the first `#`) uses `.urlFragmentAllowed`, with the `#`
  ///   delimiter itself kept verbatim.
  ///
  /// Existing percent escapes are *not* recognised: a `%` in the input is encoded
  /// again, so only call this with `encodePath: true` on strings that are not
  /// already encoded (for example after `URL(string:)` has failed).
  ///
  /// - Parameters:
  ///   - string: The URL string to parse.
  ///   - encodePath: When `true` and `string` starts with `http://` or `https://`,
  ///     percent-encode it as described above before parsing. Otherwise `string`
  ///     is parsed unchanged.
  /// - Returns: `nil` if the (possibly encoded) string still cannot be parsed as a URL.
  public init?(string: String, encodePath: Bool) {
    var candidate = string
    if encodePath, let encoded = URL.percentEncodedHTTPString(from: string) {
      candidate = encoded
    }
    self.init(string: candidate)
  }

  /// Percent-encodes the authority/path, query and fragment of an `http(s)` URL string.
  ///
  /// Returns `nil` when `string` does not start with `http://` or `https://`, or when
  /// any part cannot be percent-encoded, so the caller can fall back to the raw string.
  private static func percentEncodedHTTPString(from string: String) -> String? {
    guard let scheme = ["http://", "https://"].first(where: { string.starts(with: $0) }) else {
      return nil
    }
    let remainder = string.dropFirst(scheme.count)

    // Peel off the fragment first, then the query. Splitting on the delimiters
    // (rather than on every "/") keeps a "/" inside the query from dragging the
    // "?" into the path encoding, where it would become "%3F".
    let fragmentMarker = remainder.firstIndex(of: "#")
    let beforeFragment = remainder[..<(fragmentMarker ?? remainder.endIndex)]
    let queryMarker = beforeFragment.firstIndex(of: "?")
    let authorityAndPath = beforeFragment[..<(queryMarker ?? beforeFragment.endIndex)]

    // "/" is part of the path-allowed set, so the whole authority + path can be
    // encoded in one pass; plain ASCII host names come through unchanged.
    guard let encodedPath = authorityAndPath.addingPercentEncoding(withAllowedCharacters: .urlPathAllowed) else {
      return nil
    }
    var encoded = scheme + encodedPath

    if let queryMarker = queryMarker {
      // The slice starts at "?", which the query-allowed set leaves intact.
      let query = beforeFragment[queryMarker...]
      guard let encodedQuery = query.addingPercentEncoding(withAllowedCharacters: .urlQueryAllowed) else {
        return nil
      }
      encoded += encodedQuery
    }

    if let fragmentMarker = fragmentMarker {
      // Keep the "#" delimiter verbatim and encode only what follows it.
      let fragment = remainder[remainder.index(after: fragmentMarker)...]
      guard let encodedFragment = fragment.addingPercentEncoding(withAllowedCharacters: .urlFragmentAllowed) else {
        return nil
      }
      encoded += "#" + encodedFragment
    }

    return encoded
  }
}

View file

@ -0,0 +1,69 @@
@testable import Models
import XCTest
/// Tests for `URL(string:encodePath:)` and for decoding `HTMLString` values from JSON.
final class HTMLStringTests: XCTestCase {
  /// Checks which strings `URL(string:encodePath:)` accepts and how it encodes them.
  func testURLInit() throws {
    // Strings that are not URLs are rejected regardless of the flag.
    XCTAssertNil(URL(string: "go to www.google.com", encodePath: true))
    XCTAssertNil(URL(string: "go to www.google.com", encodePath: false))
    XCTAssertNil(URL(string: "", encodePath: true))

    // Plain ASCII URLs pass through unchanged.
    let plain = URL(string: "https://www.google.com", encodePath: true)
    XCTAssertEqual("https://www.google.com", plain?.absoluteString)

    let trailingSlash = URL(string: "https://www.google.com/", encodePath: true)
    XCTAssertEqual("https://www.google.com/", trailingSlash?.absoluteString)

    // Non-ASCII characters in the path are encoded only when requested.
    let nonASCIIPath = URL(string: "https://en.wikipedia.org/wiki/Elbbrücken_station", encodePath: true)
    XCTAssertEqual("https://en.wikipedia.org/wiki/Elbbr%C3%BCcken_station", nonASCIIPath?.absoluteString)
    XCTAssertNil(URL(string: "https://en.wikipedia.org/wiki/Elbbrücken_station", encodePath: false))

    // Non-ASCII characters in the query are encoded as well.
    let nonASCIIQuery = URL(string: "http://test.com/blah/city?name=京都市", encodePath: true)
    XCTAssertEqual("http://test.com/blah/city?name=%E4%BA%AC%E9%83%BD%E5%B8%82", nonASCIIQuery?.absoluteString)

    // Double encoding will happen if you ask to encodePath on an already encoded string
    let preEncoded = URL(string: "https://en.wikipedia.org/wiki/Elbbr%C3%BCcken_station", encodePath: true)
    XCTAssertEqual("https://en.wikipedia.org/wiki/Elbbr%25C3%25BCcken_station", preEncoded?.absoluteString)
  }

  /// Decodes several JSON-encoded HTML snippets and checks the derived text,
  /// markdown and link values.
  func testHTMLStringInit() throws {
    let decoder = JSONDecoder()
    // Decodes a JSON string literal into an `HTMLString`.
    let decode: (String) throws -> HTMLString = { json in
      try decoder.decode(HTMLString.self, from: Data(json.utf8))
    }

    // Paragraph without any links.
    do {
      let parsed = try decode("\"<p>This is a test</p>\"")
      XCTAssertEqual("This is a test", parsed.asRawText)
      XCTAssertEqual("<p>This is a test</p>", parsed.htmlValue)
      XCTAssertEqual("This is a test", parsed.asMarkdown)
      XCTAssertEqual(0, parsed.statusesURLs.count)
      XCTAssertEqual(0, parsed.links.count)
    }

    // Plain ASCII link.
    do {
      let parsed = try decode("\"<p>This is a <a href=\\\"https://test.com\\\">test</a></p>\"")
      XCTAssertEqual("This is a test", parsed.asRawText)
      XCTAssertEqual("<p>This is a <a href=\"https://test.com\">test</a></p>", parsed.htmlValue)
      XCTAssertEqual("This is a [test](https://test.com)", parsed.asMarkdown)
      XCTAssertEqual(0, parsed.statusesURLs.count)
      XCTAssertEqual(1, parsed.links.count)
      XCTAssertEqual("https://test.com", parsed.links[0].url.absoluteString)
      XCTAssertEqual("test", parsed.links[0].displayString)
    }

    // Link whose path contains non-ASCII characters.
    do {
      let parsed = try decode("\"<p>This is a <a href=\\\"https://test.com/goßëña\\\">test</a></p>\"")
      XCTAssertEqual("This is a test", parsed.asRawText)
      XCTAssertEqual("<p>This is a <a href=\"https://test.com/goßëña\">test</a></p>", parsed.htmlValue)
      XCTAssertEqual("This is a [test](https://test.com/go%C3%9F%C3%AB%C3%B1a)", parsed.asMarkdown)
      XCTAssertEqual(0, parsed.statusesURLs.count)
      XCTAssertEqual(1, parsed.links.count)
      XCTAssertEqual("https://test.com/go%C3%9F%C3%AB%C3%B1a", parsed.links[0].url.absoluteString)
      XCTAssertEqual("test", parsed.links[0].displayString)
    }

    // Link that is already percent-encoded must not be encoded a second time.
    do {
      let parsed = try decode("\"<p>This is a <a href=\\\"https://test.com/go%C3%9F%C3%AB%C3%B1a\\\">test</a></p>\"")
      XCTAssertEqual("This is a test", parsed.asRawText)
      XCTAssertEqual("<p>This is a <a href=\"https://test.com/go%C3%9F%C3%AB%C3%B1a\">test</a></p>", parsed.htmlValue)
      XCTAssertEqual("This is a [test](https://test.com/go%C3%9F%C3%AB%C3%B1a)", parsed.asMarkdown)
      XCTAssertEqual(0, parsed.statusesURLs.count)
      XCTAssertEqual(1, parsed.links.count)
      XCTAssertEqual("https://test.com/go%C3%9F%C3%AB%C3%B1a", parsed.links[0].url.absoluteString)
      XCTAssertEqual("test", parsed.links[0].displayString)
    }
  }
}

View file

@ -1,11 +0,0 @@
@testable import Models
import XCTest
/// Template test case for the `Models` module.
final class ModelsTests: XCTestCase {
  /// Example functional test: verifies the text exposed by a fresh `Models` value.
  func testExample() throws {
    // Use XCTAssert and related functions to verify that the code under test
    // produces the correct results.
    let greeting = Models().text
    XCTAssertEqual(greeting, "Hello, World!")
  }
}