Rewrite text parsing to ensure correctness

This commit is contained in:
Zed 2019-10-09 16:06:51 +02:00
parent 83a651e732
commit 7b766b793b
7 changed files with 47 additions and 100 deletions

View file

@ -6,12 +6,6 @@ import types, utils, query
from unicode import Rune, `$` from unicode import Rune, `$`
const const
urlRegex = re"((https?|ftp)://(-\.)?([^\s/?\.#]+\.?)+([/\?][^\s\)]*)?)"
emailRegex = re"([a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+)"
usernameRegex = re"(^|[^A-z0-9_?\/])@([A-z0-9_]+)"
picRegex = re"pic.twitter.com/[^ ]+"
ellipsisRegex = re" ?…"
hashtagRegex = re"([^\S]|^)([#$]\w+)"
ytRegex = re"(www.|m.)?youtu(be.com|.be)" ytRegex = re"(www.|m.)?youtu(be.com|.be)"
twRegex = re"(www.|mobile.)?twitter.com" twRegex = re"(www.|mobile.)?twitter.com"
nbsp = $Rune(0x000A0) nbsp = $Rune(0x000A0)
@ -26,75 +20,14 @@ proc shortLink*(text: string; length=28): string =
if result.len > length: if result.len > length:
result = result[0 ..< length] & "" result = result[0 ..< length] & ""
proc toLink*(url, text: string): string = proc replaceUrl*(url: string; prefs: Prefs; rss=false): string =
a(text, href=url)
proc reUrlToShortLink*(m: RegexMatch; s: string): string =
let url = s[m.group(0)[0]]
toLink(url, shortLink(url))
proc reUrlToLink*(m: RegexMatch; s: string): string =
let url = s[m.group(0)[0]]
toLink(url, url.replace(re"https?://(www.)?", ""))
proc reEmailToLink*(m: RegexMatch; s: string): string =
let url = s[m.group(0)[0]]
toLink("mailto://" & url, url)
proc reHashtagToLink*(m: RegexMatch; s: string): string =
result = if m.group(0).len > 0: s[m.group(0)[0]] else: ""
let hash = s[m.group(1)[0]]
let link = toLink("/search?q=" & encodeUrl(hash), hash)
if hash.any(isAlphaAscii):
result &= link
else:
result &= hash
proc reUsernameToLink*(m: RegexMatch; s: string): string =
var username = ""
var pretext = ""
let pre = m.group(0)
let match = m.group(1)
username = s[match[0]]
if pre.len > 0:
pretext = s[pre[0]]
pretext & toLink("/" & username, "@" & username)
proc reUsernameToFullLink*(m: RegexMatch; s: string): string =
result = reUsernameToLink(m, s)
result = result.replace("href=\"/", &"href=\"https://{hostname}/")
proc replaceUrl*(url: string; prefs: Prefs): string =
result = url result = url
if prefs.replaceYouTube.len > 0: if prefs.replaceYouTube.len > 0:
result = result.replace(ytRegex, prefs.replaceYouTube) result = result.replace(ytRegex, prefs.replaceYouTube)
if prefs.replaceTwitter.len > 0: if prefs.replaceTwitter.len > 0:
result = result.replace(twRegex, prefs.replaceTwitter) result = result.replace(twRegex, prefs.replaceTwitter)
proc linkifyText*(text: string; prefs: Prefs; rss=false): string =
result = xmltree.escape(stripText(text))
result = result.replace(ellipsisRegex, " ")
result = result.replace(emailRegex, reEmailToLink)
if rss: if rss:
result = result.replace(urlRegex, reUrlToLink) result = result.replace("href=\"/", "href=\"" & hostname & "/")
result = result.replace(usernameRegex, reUsernameToFullLink)
else:
result = result.replace(urlRegex, reUrlToShortLink)
result = result.replace(usernameRegex, reUsernameToLink)
result = result.replace(hashtagRegex, reHashtagToLink)
result = result.replace(re"([^\s\(\n%])<a", "$1 <a")
result = result.replace(re"</a>\s+([;.,!\)'%]|&apos;)", "</a>$1")
result = result.replace(re"^\. <a", ".<a")
result = result.replaceUrl(prefs)
proc stripTwitterUrls*(text: string): string =
result = text
result = result.replace(picRegex, "")
result = result.replace(ellipsisRegex, "")
proc proxifyVideo*(manifest: string; proxy: bool): string = proc proxifyVideo*(manifest: string; proxy: bool): string =
proc cb(m: RegexMatch; s: string): string = proc cb(m: RegexMatch; s: string): string =

View file

@ -1,9 +1,10 @@
import xmltree, strtabs, strformat, strutils, times import xmltree, strtabs, strformat, strutils, times, uri
import regex import regex
import types, formatters import types, formatters
from q import nil from q import nil
from htmlgen import a
const const
thumbRegex = re".+:url\('([^']+)'\)" thumbRegex = re".+:url\('([^']+)'\)"
@ -41,29 +42,41 @@ proc isVerified*(profile: XmlNode): bool =
proc isProtected*(profile: XmlNode): bool = proc isProtected*(profile: XmlNode): bool =
getHeader(profile).select(".Icon.Icon--protected") != nil getHeader(profile).select(".Icon.Icon--protected") != nil
proc emojify*(node: XmlNode) = proc parseText*(text: XmlNode; skipLink=""): string =
for i in node.selectAll(".Emoji"): for el in text:
i.add newText(i.attr("alt")) case el.kind
of xnText:
result.add el
of xnElement:
if el.attrs == nil:
if el.tag == "strong":
result.add $el
continue
let class = el.attr("class")
if "data-expanded-url" in el.attrs:
let url = el.attr("data-expanded-url")
if url == skipLink: continue
elif "u-hidden" in class: result.add "\n"
result.add a(shortLink(url), href=url)
elif "ashtag" in class:
let hash = el.innerText()
result.add a(hash, href=("/search?q=" & encodeUrl(hash)))
elif "atreply" in class:
result.add a(el.innerText(), href=el.attr("href"))
elif "Emoji" in class:
result.add el.attr("alt")
else: discard
proc getQuoteText*(tweet: XmlNode): string = proc getQuoteText*(tweet: XmlNode): string =
let text = tweet.select(".QuoteTweet-text") parseText(tweet.select(".QuoteTweet-text"))
emojify(text)
result = stripText(text.innerText())
result = stripTwitterUrls(result)
proc getTweetText*(tweet: XmlNode): string = proc getTweetText*(tweet: XmlNode): string =
let let
quote = tweet.select(".QuoteTweet") quote = tweet.select(".QuoteTweet")
text = tweet.select(".tweet-text") text = tweet.select(".tweet-text")
link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url") link = text.selectAttr("a.twitter-timeline-link.u-hidden", "data-expanded-url")
parseText(text, if quote != nil: link else: "")
emojify(text)
result = stripText(text.innerText())
if quote != nil and link.len > 0:
result = result.replace(link, "")
result = stripTwitterUrls(result)
proc getTime(tweet: XmlNode): XmlNode = proc getTime(tweet: XmlNode): XmlNode =
tweet.select(".js-short-timestamp") tweet.select(".js-short-timestamp")
@ -87,10 +100,10 @@ proc getUsername*(profile: XmlNode; selector: string): string =
profile.selectText(selector).strip(chars={'@', ' ', '\n'}) profile.selectText(selector).strip(chars={'@', ' ', '\n'})
proc getBio*(profile: XmlNode; selector: string; fallback=""): string = proc getBio*(profile: XmlNode; selector: string; fallback=""): string =
var bio = profile.selectText(selector) var bio = profile.select(selector)
if bio.len == 0 and fallback.len > 0: if bio == nil and fallback.len > 0:
bio = profile.selectText(fallback) bio = profile.select(fallback)
stripText(bio) parseText(bio)
proc getLocation*(profile: XmlNode): string = proc getLocation*(profile: XmlNode): string =
let sel = ".ProfileHeaderCard-locationText" let sel = ".ProfileHeaderCard-locationText"

View file

@ -25,7 +25,7 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
tdiv(class="profile-card-extra"): tdiv(class="profile-card-extra"):
if profile.bio.len > 0: if profile.bio.len > 0:
tdiv(class="profile-bio"): tdiv(class="profile-bio"):
p: verbatim linkifyText(profile.bio, prefs) p: verbatim replaceUrl(profile.bio, prefs)
if profile.location.len > 0: if profile.location.len > 0:
tdiv(class="profile-location"): tdiv(class="profile-location"):
@ -39,8 +39,9 @@ proc renderProfileCard*(profile: Profile; prefs: Prefs): VNode =
if profile.website.len > 0: if profile.website.len > 0:
tdiv(class="profile-website"): tdiv(class="profile-website"):
span: span:
let url = replaceUrl(profile.website, prefs)
icon "link" icon "link"
verbatim linkifyText(profile.website, prefs) a(href=url): text shortLink(url)
tdiv(class="profile-joindate"): tdiv(class="profile-joindate"):
span(title=getJoinDateFull(profile)): span(title=getJoinDateFull(profile)):

View file

@ -7,7 +7,7 @@
#if tweet.pinned: result = "Pinned: " #if tweet.pinned: result = "Pinned: "
#elif tweet.retweet.isSome: result = "RT: " #elif tweet.retweet.isSome: result = "RT: "
#end if #end if
#result &= xmltree.escape(replaceUrl(tweet.text, prefs)) #result &= xmltree.escape(replaceUrl(tweet.text, prefs, rss=true))
#if result.len > 0: return #if result.len > 0: return
#end if #end if
#if tweet.photos.len > 0: #if tweet.photos.len > 0:
@ -20,7 +20,7 @@
#end proc #end proc
# #
#proc renderRssTweet(tweet: Tweet; prefs: Prefs): string = #proc renderRssTweet(tweet: Tweet; prefs: Prefs): string =
#let text = linkifyText(tweet.text, prefs, rss=true) #let text = replaceUrl(tweet.text, prefs, rss=true)
#if tweet.quote.isSome and get(tweet.quote).available: #if tweet.quote.isSome and get(tweet.quote).available:
#let quoteLink = hostname & getLink(get(tweet.quote)) #let quoteLink = hostname & getLink(get(tweet.quote))
<p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p> <p>${text}<br><a href="https://${quoteLink}">${quoteLink}</a></p>
@ -58,7 +58,7 @@
#end proc #end proc
# #
#proc renderTimelineRss*(timeline: Timeline; profile: Profile): string = #proc renderTimelineRss*(timeline: Timeline; profile: Profile): string =
#let prefs = Prefs(replaceTwitter: hostname) #let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
#result = "" #result = ""
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
<rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0"> <rss xmlns:atom="http://www.w3.org/2005/Atom" xmlns:dc="http://purl.org/dc/elements/1.1/" version="2.0">
@ -84,7 +84,7 @@
#end proc #end proc
# #
#proc renderListRss*(tweets: seq[Tweet]; name, list: string): string = #proc renderListRss*(tweets: seq[Tweet]; name, list: string): string =
#let prefs = Prefs(replaceTwitter: hostname) #let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
#let link = &"https://{hostname}/{name}/lists/{list}" #let link = &"https://{hostname}/{name}/lists/{list}"
#result = "" #result = ""
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>
@ -102,7 +102,7 @@
#end proc #end proc
# #
#proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string = #proc renderSearchRss*(tweets: seq[Tweet]; name, param: string): string =
#let prefs = Prefs(replaceTwitter: hostname) #let prefs = Prefs(replaceTwitter: hostname, replaceYoutube: "invidio.us")
#let link = &"https://{hostname}/search" #let link = &"https://{hostname}/search"
#result = "" #result = ""
<?xml version="1.0" encoding="UTF-8"?> <?xml version="1.0" encoding="UTF-8"?>

View file

@ -56,7 +56,7 @@ proc renderUser(user: Profile; prefs: Prefs): VNode =
linkUser(user, class="username") linkUser(user, class="username")
tdiv(class="tweet-content media-body"): tdiv(class="tweet-content media-body"):
verbatim linkifyText(user.bio, prefs) verbatim replaceUrl(user.bio, prefs)
proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode = proc renderTimelineUsers*(results: Result[Profile]; prefs: Prefs; path=""): VNode =
buildHtml(tdiv(class="timeline")): buildHtml(tdiv(class="timeline")):

View file

@ -215,7 +215,7 @@ proc renderQuote(quote: Quote; prefs: Prefs): VNode =
renderReply(quote) renderReply(quote)
tdiv(class="quote-text"): tdiv(class="quote-text"):
verbatim linkifyText(quote.text, prefs) verbatim replaceUrl(quote.text, prefs)
if quote.hasThread: if quote.hasThread:
a(class="show-thread", href=getLink(quote)): a(class="show-thread", href=getLink(quote)):
@ -248,7 +248,7 @@ proc renderTweet*(tweet: Tweet; prefs: Prefs; path: string; class="";
renderReply(tweet) renderReply(tweet)
tdiv(class="tweet-content media-body"): tdiv(class="tweet-content media-body"):
verbatim linkifyText(tweet.text, prefs) verbatim replaceUrl(tweet.text, prefs)
if tweet.quote.isSome: if tweet.quote.isSome:
renderQuote(tweet.quote.get(), prefs) renderQuote(tweet.quote.get(), prefs)

View file

@ -51,7 +51,7 @@ link = [
'old.reddit.com/r/programming…' 'old.reddit.com/r/programming…'
]], ]],
['nim_lang/status/1125887775151140864', [ ['nim_lang/status/1125887775151140864', [
'en.wikipedia.org/wiki/Nim_(p…)' 'en.wikipedia.org/wiki/Nim_(p…'
]], ]],
['hiankun_taioan/status/1086916335215341570', [ ['hiankun_taioan/status/1086916335215341570', [
'(hackernoon.com/interview-wit…)' '(hackernoon.com/interview-wit…)'