nitter/src/formatters.nim

# SPDX-License-Identifier: AGPL-3.0-only
import strutils, strformat, times, uri, tables, xmltree, htmlparser, htmlgen
import regex
import types, utils, query

const
  # YouTube hosts (youtube.com / youtu.be) with an optional subdomain prefix.
  ytRegex = re"([A-Za-z.]+\.)?youtu(be\.com|\.be)"
  # Instagram host with an optional "www." prefix.
  igRegex = re"(www\.)?instagram\.com"

  # Reddit host with an optional known subdomain. The lookbehind rejects
  # matches that are directly preceded by "." or "b".
  rdRegex = re"(?<![.b])((www|np|new|amp|old)\.)?reddit\.com"
  # Short "redd.it/" links, with the same lookbehind as rdRegex.
  rdShortRegex = re"(?<![.b])redd\.it\/"
  # Videos cannot be supported uniformly between Teddit and Libreddit,
  # so v.redd.it links will not be replaced.
  # Images aren't supported due to errors from Teddit when the image
  # wasn't first displayed via a post on the Teddit instance.

  # Twitter host (optionally "www." / "mobile.") that either follows an
  # "https://" which starts a whitespace-delimited token, or follows whitespace.
  twRegex = re"(?<=(?<!\S)https:\/\/|(?<=\s))(www\.|mobile\.)?twitter\.com"
  # Rendered anchor pointing at twitter.com; group 1 is the href path,
  # group 2 is the visible text after the host.
  twLinkRegex = re"""<a href="https:\/\/twitter\.com([^"]+)">twitter\.com(\S+)</a>"""

  cards = "cards.twitter.com/cards"
  tco = "https://t.co"

  # Scheme plus optional "www." / "www<digit>." prefix of a URL.
  wwwRegex = re"https?://(www[0-9]?\.)?"
  # Captures the value of a url="..." attribute ending in ".m3u8".
  m3u8Regex = re"""url="(.+\.m3u8)""""
  # Captures a path (without the leading slash) that ends in one of the
  # listed media/playlist extensions.
  manifestRegex = re"\/(.+(\.ts|\.m4s|\.m3u8|\.vmap|\.mp4))"
  # Size suffix of a profile picture followed by the file extension (group 2).
  userPicRegex = re"_(normal|bigger|mini|200x200|400x400)(\.[A-Za-z]+)$"
  # Trailing file extension.
  extRegex = re"(\.[A-Za-z]+)$"
  # Any character outside the ranges permitted in XML 1.0 documents.
  # The supplementary planes use the 8-digit \U form: \u takes exactly four
  # hex digits, so "\u10000-\u10FFFF" did not describe that range.
  illegalXmlRegex = re"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]"

  twitter = parseUri("https://twitter.com")

proc getUrlPrefix*(cfg: Config): string =
  ## Returns the URL scheme followed by `cfg.hostname`: the `https` prefix
  ## when `cfg.useHttps` is set, `"http://"` otherwise.
  let scheme = if cfg.useHttps: https else: "http://"
  scheme & cfg.hostname

proc stripHtml*(text: string): string =
  ## Returns the text content of the HTML fragment `text`.
  ##
  ## For every `<a>` whose `href` contains "http", the text of the anchor's
  ## first child is replaced with the href so the full link is shown
  ## instead of the anchor's own text.
  const textLike = {xnText, xnComment, xnCData, xnEntity}
  let html = parseHtml(text)

  # `parseHtml` returns a bare text node when the input contains no markup,
  # while `findAll` only accepts element nodes; skip the link pass then.
  if html.kind == xnElement:
    for el in html.findAll("a"):
      let link = el.attr("href")
      if "http" notin link or el.len == 0:
        continue
      # `text=` is only valid on text-like nodes; anchors whose first child
      # is another element are left untouched.
      if el[0].kind in textLike:
        el[0].text = link

  html.innerText()

proc sanitizeXml*(text: string): string =
  ## Returns `text` with every match of `illegalXmlRegex` removed.
  result = replace(text, illegalXmlRegex, "")

proc shortLink*(text: string; length=28): string =
  ## Strips the scheme and "www." prefix (see `wwwRegex`) from `text` and,
  ## when the remainder is longer than `length` bytes, truncates it and
  ## appends "…".
  ##
  ## The cut is moved back to the nearest UTF-8 character boundary so a
  ## multi-byte character is never split in half; the truncated part is
  ## therefore at most `length` bytes long.
  result = text.replace(wwwRegex, "")
  if result.len > length:
    var cut = length
    # Bytes of the form 10xxxxxx are UTF-8 continuation bytes; cutting in
    # front of one would leave a partial character behind.
    while cut > 0 and (ord(result[cut]) and 0xC0) == 0x80:
      dec cut
    result = result[0 ..< cut] & "…"

proc replaceUrls*(body: string; prefs: Prefs; absolute=""): string =
  ## Rewrites YouTube, Twitter, Reddit and Instagram hosts found in `body`
  ## to the replacement hosts stored in `prefs`. A service is skipped when
  ## its replacement preference is empty. When `absolute` is non-empty,
  ## `href` values starting with "/" are prefixed with it.
  let
    ytHost = prefs.replaceYouTube
    twHost = prefs.replaceTwitter
    rdHost = prefs.replaceReddit
    igHost = prefs.replaceInstagram

  result = body

  if ytHost.len > 0 and result.contains(ytRegex):
    result = result.replace(ytRegex, ytHost)
    # Once the replacement host is present, drop "/c/" path segments.
    if result.contains(ytHost):
      result = result.replace("/c/", "/")

  if twHost.len > 0 and
     (result.contains(twRegex) or result.contains(twLinkRegex) or
      result.contains(tco)):
    # Replacement anchor for rendered twitter.com links; $1/$2 refer to the
    # capture groups of twLinkRegex.
    let anchor = a(twHost & "$2", href = https & twHost & "$1")
    result = result.replace(tco, https & twHost & "/t.co")
    result = result.replace(cards, twHost & "/cards")
    result = result.replace(twRegex, twHost)
    result = result.replace(twLinkRegex, anchor)

  if rdHost.len > 0 and
     (result.contains(rdRegex) or result.contains("redd.it")):
    # Short links first, then the full host.
    result = result.replace(rdShortRegex, rdHost & "/comments/")
    result = result.replace(rdRegex, rdHost)
    if result.contains(rdHost) and result.contains("/gallery/"):
      result = result.replace("/gallery/", "/comments/")

  if igHost.len > 0 and result.contains(igRegex):
    result = result.replace(igRegex, igHost)

  if absolute.len > 0 and result.contains("href"):
    result = result.replace("href=\"/", "href=\"" & absolute & "/")

proc getM3u8Url*(content: string): string =
  ## Returns the first capture of `m3u8Regex` found in `content`, or an
  ## empty string when the pattern does not occur.
  var found: RegexMatch
  if not content.find(m3u8Regex, found):
    return
  let span = found.group(0)[0]
  result = content[span]

proc proxifyVideo*(manifest: string; proxy: bool): string =
  ## Replaces every `manifestRegex` match in `manifest` with an absolute
  ## "https://video.twimg.com/" URL; when `proxy` is true that URL is
  ## additionally passed through `getVidUrl`.
  proc toAbsolute(m: RegexMatch; s: string): string =
    # Group 0 holds the matched path without its leading slash.
    let path = s[m.group(0)[0]]
    result = "https://video.twimg.com/" & path
    if proxy:
      result = getVidUrl(result)

  manifest.replace(manifestRegex, toAbsolute)

proc getUserPic*(userPic: string; style=""): string =
  ## Removes the size suffix matched by `userPicRegex` from `userPic` and
  ## inserts `style` in front of the file extension.
  userPic.replace(userPicRegex, "$2").replace(extRegex, style & "$1")

proc getUserPic*(profile: Profile; style=""): string =
  ## Convenience overload that applies `getUserPic` to `profile.userPic`.
  result = profile.userPic.getUserPic(style)

proc getVideoEmbed*(cfg: Config; id: int64): string =
  ## Builds the "/i/videos/<id>" URL on top of `getUrlPrefix(cfg)`.
  getUrlPrefix(cfg) & "/i/videos/" & $id

proc pageTitle*(profile: Profile): string =
  ## Formats a profile as "<fullname> (@<username>)".
  profile.fullname & " (@" & profile.username & ")"

proc pageTitle*(tweet: Tweet): string =
  ## Formats a tweet as the author's profile title, a colon, and the tweet
  ## text (HTML stripped) wrapped in double quotes.
  let author = pageTitle(tweet.profile)
  let plain = stripHtml(tweet.text)
  author & ": \"" & plain & "\""

proc pageDesc*(profile: Profile): string =
  ## Returns the profile bio with HTML stripped, or a generic
  ## "The latest tweets from <fullname>" line when the bio is empty.
  result =
    if profile.bio.len == 0:
      "The latest tweets from " & profile.fullname
    else:
      stripHtml(profile.bio)

proc getJoinDate*(profile: Profile): string =
  ## Formats `profile.joinDate` as the literal "Joined" followed by the
  ## full month name and the year.
  result = format(profile.joinDate, "'Joined' MMMM YYYY")

proc getJoinDateFull*(profile: Profile): string =
  ## Formats `profile.joinDate` with time of day, day, abbreviated month
  ## and year ("h:mm tt - d MMM YYYY").
  result = format(profile.joinDate, "h:mm tt - d MMM YYYY")

proc getTime*(tweet: Tweet): string =
  ## Formats `tweet.time` as date, a " · " separator and time of day.
  ## The trailing " UTC" is a literal in the format string, not derived
  ## from the timestamp's zone.
  result = format(tweet.time, "MMM d', 'YYYY' · 'h:mm tt' UTC'")

proc getRfc822Time*(tweet: Tweet): string =
  ## Formats `tweet.time` in an RFC 822 style layout. The "GMT" suffix is a
  ## literal in the format string.
  result = format(tweet.time, "ddd', 'dd MMM yyyy HH:mm:ss 'GMT'")

proc getShortTime*(tweet: Tweet): string =
  ## Returns a compact age/date label for `tweet.time`:
  ## a full date when the year differs from the current one, "MMM d" when
  ## at least a day old, otherwise the elapsed hours/minutes/seconds with a
  ## unit suffix, or "now" when one second or less has passed.
  let
    current = now()
    elapsed = current - tweet.time

  result =
    if current.year != tweet.time.year:
      tweet.time.format("d MMM yyyy")
    elif elapsed.inDays >= 1:
      tweet.time.format("MMM d")
    elif elapsed.inHours >= 1:
      $elapsed.inHours & "h"
    elif elapsed.inMinutes >= 1:
      $elapsed.inMinutes & "m"
    elif elapsed.inSeconds > 1:
      $elapsed.inSeconds & "s"
    else:
      "now"

proc getLink*(tweet: Tweet; focus=true): string =
  ## Builds the "/<username>/status/<id>" path for `tweet`, using "i" when
  ## the author's username is empty. Returns an empty string when the tweet
  ## id is 0. With `focus` set, the "#m" fragment is appended.
  if tweet.id == 0:
    return
  let username =
    if tweet.profile.username.len == 0: "i"
    else: tweet.profile.username
  result = "/" & username & "/status/" & $tweet.id
  if focus:
    result.add "#m"

proc getTwitterLink*(path: string; params: Table[string, string]): string =
  ## Builds the twitter.com URL corresponding to `path` and the request
  ## `params`. Plain (non-search, fewer than two users) pages map directly
  ## to `twitter / path`; everything else becomes a search URL whose query
  ## is produced by `genQueryParam`.
  var
    username = params.getOrDefault("name")
    query = initQuery(params, username)
    target = path

  # A comma-separated "name" is turned into a multi-user search.
  if username.contains(","):
    query.fromUser = username.split(",")
    target = "/search"

  if query.fromUser.len < 2 and "/search" notin target:
    return $(twitter / target)

  let mode = if query.kind == users: "user" else: "live"
  let p = {
    "f": mode,
    "q": genQueryParam(query),
    "src": "typed_query"
  }

  result = $(twitter / target ? p)
  # Remove the "/<username>" segment from the generated URL.
  if username.len > 0:
    result = result.replace("/" & username, "")

proc getLocation*(u: Profile | Tweet): (string, string) =
  ## Splits `u.location` into a display name and a place-search URL.
  ## A location containing "://" is returned unchanged with an empty URL.
  ## Otherwise the part before the first ":" is the name, and the part
  ## after it (if any) is appended to "/search?q=place:".
  if u.location.contains("://"):
    return (u.location, "")
  let parts = u.location.split(":")
  result[0] = parts[0]
  if parts.len > 1:
    result[1] = "/search?q=place:" & parts[1]

proc getSuspended*(username: string): string =
  ## Returns the notice shown for a suspended account, with `username`
  ## wrapped in double quotes.
  "User \"" & username & "\" has been suspended"
Add license headers Closes #413 2021-12-27 01:37:38 +00:00			`# SPDX-License-Identifier: AGPL-3.0-only`
Fix Twitter link replacements Fixes #492 2021-12-30 04:11:05 +00:00			`import strutils, strformat, times, uri, tables, xmltree, htmlparser, htmlgen`
Initial commit 2019-06-20 14:16:20 +00:00			`import regex`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`import types, utils, query`
Initial commit 2019-06-20 14:16:20 +00:00
			`const`
Fix incorrect regex Fixes #109 2020-01-19 07:49:20 +00:00			`ytRegex = re"([A-z.]+\.)?youtu(be\.com\|\.be)"`
Fix unescaped dot in Instagram regex (#471) Similar to edb37511813ba803568a71de997031ed79ad5329 (#109) 2021-11-26 21:49:44 +00:00			`igRegex = re"(www\.)?instagram\.com"`
Add Reddit link replacement support Closes #306 Closes #353 2021-12-27 01:13:05 +00:00
			`rdRegex = re"(?<![.b])((www\|np\|new\|amp\|old)\.)?reddit.com"`
			`rdShortRegex = re"(?<![.b])redd\.it\/"`
			`# Videos cannot be supported uniformly between Teddit and Libreddit,`
			`# so v.redd.it links will not be replaced.`
			`# Images aren't supported due to errors from Teddit when the image`
			`# wasn't first displayed via a post on the Teddit instance.`

Improve Twitter regex 2021-12-28 06:01:52 +00:00			`twRegex = re"(?<=(?<!\S)https:\/\/\|(?<=\s))(www\.\|mobile\.)?twitter\.com"`
Fix Twitter link replacements Fixes #492 2021-12-30 04:11:05 +00:00			`twLinkRegex = re"""<a href="https:\/\/twitter.com([^"]+)">twitter\.com(\S+)</a>"""`

Fix card links 2020-03-08 23:33:52 +00:00			`cards = "cards.twitter.com/cards"`
Add t.co and /cards link resolvers For t.co links: https://t.co/.. -> nitter.net/t.co/.. For card links: https://cards.twitter.com/cards/.. -> nitter.net/cards/... 2019-12-30 10:41:09 +00:00			`tco = "https://t.co"`
Add simple emoji support 2019-06-25 00:38:18 +00:00
Turn regex patterns into consts 2020-01-22 12:04:35 +00:00			`wwwRegex = re"https?://(www[0-9]?\.)?"`
Optional base64 support for proxy urls 2020-06-09 13:04:38 +00:00			`m3u8Regex = re"""url="(.+.m3u8)""""`
Hack in support for the new twitter video format (#381) This change adds ".m4s" and ".mp4" to the regex that modifies m3u8 playlist files proxied from twitter, and adds ".m4s" to the list of extensions proxied through nitter. The net effect is the new video format that twitter is using now should be supported. 2021-05-08 16:05:31 +00:00			`manifestRegex = re"\/(.+(.ts\|.m4s\|.m3u8\|.vmap\|.mp4))"`
Style fixes 2022-01-06 02:57:14 +00:00			`userPicRegex = re"_(normal\|bigger\|mini\|200x200\|400x400)(\.[A-z]+)$"`
Turn regex patterns into consts 2020-01-22 12:04:35 +00:00			`extRegex = re"(\.[A-z]+)$"`
Sanitize XML to remove invalid characters Fixes #268 2020-11-07 22:53:49 +00:00			`illegalXmlRegex = re"[^\x09\x0A\x0D\x20-\uD7FF\uE000-\uFFFD\u10000-\u10FFFF]"`
Initial commit 2019-06-20 14:16:20 +00:00
Partial fix for wrong multi-user twitter link 2020-06-17 12:15:13 +00:00			`twitter = parseUri("https://twitter.com")`

Add proper http support Fixes #223 2021-01-08 01:25:43 +00:00			`proc getUrlPrefix*(cfg: Config): string =`
Minor code improvements 2021-12-30 03:18:40 +00:00			`if cfg.useHttps: https & cfg.hostname`
Add proper http support Fixes #223 2021-01-08 01:25:43 +00:00			`else: "http://" & cfg.hostname`

Minor cleanup, fix empty lines before card links 2019-10-10 15:47:02 +00:00			`proc stripHtml*(text: string): string =`
Unshortify links when stripping html 2019-10-11 17:20:40 +00:00			`var html = parseHtml(text)`
			`for el in html.findAll("a"):`
			`let link = el.attr("href")`
			`if "http" in link:`
Misc. changes 2020-06-01 00:25:39 +00:00			`if el.len == 0: continue`
Unshortify links when stripping html 2019-10-11 17:20:40 +00:00			`el[0].text = link`
Minor cleanup, fix empty lines before card links 2019-10-10 15:47:02 +00:00			`html.innerText()`

Sanitize XML to remove invalid characters Fixes #268 2020-11-07 22:53:49 +00:00			`proc sanitizeXml*(text: string): string =`
			`text.replace(illegalXmlRegex, "")`

Initial commit 2019-06-20 14:16:20 +00:00			`proc shortLink*(text: string; length=28): string =`
Turn regex patterns into consts 2020-01-22 12:04:35 +00:00			`result = text.replace(wwwRegex, "")`
Initial commit 2019-06-20 14:16:20 +00:00			`if result.len > length:`
			`result = result[0 ..< length] & "…"`

Rename replaceUrl to replaceUrls 2021-12-27 01:27:49 +00:00			`proc replaceUrls*(body: string; prefs: Prefs; absolute=""): string =`
			`result = body`
Avoid unnecessary string allocations in replaceUrl 2021-12-26 23:42:52 +00:00
			`if prefs.replaceYouTube.len > 0 and ytRegex in result:`
Include 'www.' in twitter/youtube link replacement 2019-08-15 17:19:21 +00:00			`result = result.replace(ytRegex, prefs.replaceYouTube)`
Fix converted youtube channel links 2020-03-08 23:47:00 +00:00			`if prefs.replaceYouTube in result:`
			`result = result.replace("/c/", "/")`
Avoid unnecessary string allocations in replaceUrl 2021-12-26 23:42:52 +00:00
			`if prefs.replaceTwitter.len > 0 and`
Fix Twitter link replacements Fixes #492 2021-12-30 04:11:05 +00:00			`(twRegex in result or twLinkRegex in result or tco in result):`
Minor code improvements 2021-12-30 03:18:40 +00:00			`result = result.replace(tco, https & prefs.replaceTwitter & "/t.co")`
Fix card links 2020-03-08 23:33:52 +00:00			`result = result.replace(cards, prefs.replaceTwitter & "/cards")`
			`result = result.replace(twRegex, prefs.replaceTwitter)`
Fix Twitter link replacements Fixes #492 2021-12-30 04:11:05 +00:00			`result = result.replace(twLinkRegex, a(`
			`prefs.replaceTwitter & "$2", href = https & prefs.replaceTwitter & "$1"))`
Avoid unnecessary string allocations in replaceUrl 2021-12-26 23:42:52 +00:00
Add Reddit link replacement support Closes #306 Closes #353 2021-12-27 01:13:05 +00:00			`if prefs.replaceReddit.len > 0 and (rdRegex in result or "redd.it" in result):`
			`result = result.replace(rdShortRegex, prefs.replaceReddit & "/comments/")`
			`result = result.replace(rdRegex, prefs.replaceReddit)`
			`if prefs.replaceReddit in result and "/gallery/" in result:`
			`result = result.replace("/gallery/", "/comments/")`

Avoid unnecessary string allocations in replaceUrl 2021-12-26 23:42:52 +00:00			`if prefs.replaceInstagram.len > 0 and igRegex in result:`
			`result = result.replace(igRegex, prefs.replaceInstagram)`

			`if absolute.len > 0 and "href" in result:`
Add proper http support Fixes #223 2021-01-08 01:25:43 +00:00			`result = result.replace("href=\"/", "href=\"" & absolute & "/")`
Initial commit 2019-06-20 14:16:20 +00:00
Optional base64 support for proxy urls 2020-06-09 13:04:38 +00:00			`proc getM3u8Url*(content: string): string =`
			`var m: RegexMatch`
Revert "Use match instead of find, minor performance gain" This reverts commit 759728e3630036c7771a74794f19b52c2ff17597. 2021-12-26 23:10:42 +00:00			`if content.find(m3u8Regex, m):`
Optional base64 support for proxy urls 2020-06-09 13:04:38 +00:00			`result = content[m.group(0)[0]]`

Add video proxy support 2019-08-19 18:53:47 +00:00			`proc proxifyVideo*(manifest: string; proxy: bool): string =`
			`proc cb(m: RegexMatch; s: string): string =`
Hack in support for the new twitter video format (#381) This change adds ".m4s" and ".mp4" to the regex that modifies m3u8 playlist files proxied from twitter, and adds ".m4s" to the list of extensions proxied through nitter. The net effect is the new video format that twitter is using now should be supported. 2021-05-08 16:05:31 +00:00			`result = "https://video.twimg.com/" & s[m.group(0)[0]]`
Restrict image/gif media host instead of hashing 2019-09-13 10:27:04 +00:00			`if proxy: result = getVidUrl(result)`
Turn regex patterns into consts 2020-01-22 12:04:35 +00:00			`result = manifest.replace(manifestRegex, cb)`
Add video proxy support 2019-08-19 18:53:47 +00:00
Style fixes 2022-01-06 02:57:14 +00:00			`proc getUserPic*(userPic: string; style=""): string =`
			`let pic = userPic.replace(userPicRegex, "$2")`
Turn regex patterns into consts 2020-01-22 12:04:35 +00:00			`pic.replace(extRegex, style & "$1")`
Initial commit 2019-06-20 14:16:20 +00:00
Style fixes 2022-01-06 02:57:14 +00:00			`proc getUserPic*(profile: Profile; style=""): string =`
			`getUserPic(profile.userPic, style)`
Initial commit 2019-06-20 14:16:20 +00:00
Change ID types to int64 2019-12-09 23:39:12 +00:00			`proc getVideoEmbed*(cfg: Config; id: int64): string =`
Add proper http support Fixes #223 2021-01-08 01:25:43 +00:00			`&"{getUrlPrefix(cfg)}/i/videos/{id}"`
Implement link previews 2019-08-07 20:02:19 +00:00
Add dynamic page title 2019-06-24 20:40:48 +00:00			`proc pageTitle*(profile: Profile): string =`
Add server config file 2019-07-31 00:15:43 +00:00			`&"{profile.fullname} (@{profile.username})"`
Ensure correct text formatting 2019-06-25 02:52:38 +00:00
Add tweet page titles Fixes #124 2020-03-29 07:15:05 +00:00			`proc pageTitle*(tweet: Tweet): string =`
			`&"{pageTitle(tweet.profile)}: \"{stripHtml(tweet.text)}\""`

Implement link previews 2019-08-07 20:02:19 +00:00			`proc pageDesc*(profile: Profile): string =`
Display profile bio in preview 2019-10-11 16:43:47 +00:00			`if profile.bio.len > 0:`
			`stripHtml(profile.bio)`
			`else:`
			`"The latest tweets from " & profile.fullname`
Implement link previews 2019-08-07 20:02:19 +00:00
Revamp profile api to display more metadata 2019-08-11 19:26:55 +00:00			`proc getJoinDate*(profile: Profile): string =`
			`profile.joinDate.format("'Joined' MMMM YYYY")`

			`proc getJoinDateFull*(profile: Profile): string =`
			`profile.joinDate.format("h:mm tt - d MMM YYYY")`

Ensure correct text formatting 2019-06-25 02:52:38 +00:00			`proc getTime*(tweet: Tweet): string =`
Rearrange date string 2022-01-03 02:52:39 +00:00			`tweet.time.format("MMM d', 'YYYY' · 'h:mm tt' UTC'")`
Improve RSS validity 2019-09-15 09:14:03 +00:00
			`proc getRfc822Time*(tweet: Tweet): string =`
Changed procedure getRfc822Time to comply with RSS 2.0 spec (#404) Co-authored-by: David Robinson <daveed@mailbox.org> 2021-06-23 21:15:51 +00:00			`tweet.time.format("ddd', 'dd MMM yyyy HH:mm:ss 'GMT'")`
Generate tweet links 2019-07-01 21:14:36 +00:00
Misc. changes 2020-06-01 00:25:39 +00:00			`proc getShortTime*(tweet: Tweet): string =`
Preserve original UTC timestamp 2020-06-02 19:06:44 +00:00			`let now = now()`
Fix compiler warnings 2021-12-20 02:11:12 +00:00			`let since = now - tweet.time`
Preserve original UTC timestamp 2020-06-02 19:06:44 +00:00
Fix compiler warnings 2021-12-20 02:11:12 +00:00			`if now.year != tweet.time.year:`
Misc. changes 2020-06-01 00:25:39 +00:00			`result = tweet.time.format("d MMM yyyy")`
			`elif since.inDays >= 1:`
			`result = tweet.time.format("MMM d")`
			`elif since.inHours >= 1:`
			`result = $since.inHours & "h"`
			`elif since.inMinutes >= 1:`
			`result = $since.inMinutes & "m"`
			`elif since.inSeconds > 1:`
			`result = $since.inSeconds & "s"`
			`else:`
			`result = "now"`

			`proc getLink*(tweet: Tweet; focus=true): string =`
Use int for tweet ids for correct thread sorting 2019-10-10 16:22:14 +00:00			`if tweet.id == 0: return`
Misc. changes 2020-06-01 00:25:39 +00:00			`var username = tweet.profile.username`
			`if username.len == 0:`
			`username = "i"`
			`result = &"/{username}/status/{tweet.id}"`
Focus main tweet in threads 2019-10-22 07:17:58 +00:00			`if focus: result &= "#m"`
Show reasons for tweets being withheld Fixes #33 2019-09-08 12:34:26 +00:00
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`proc getTwitterLink*(path: string; params: Table[string, string]): string =`
Partial fix for wrong multi-user twitter link 2020-06-17 12:15:13 +00:00			`var`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`username = params.getOrDefault("name")`
			`query = initQuery(params, username)`
Partial fix for wrong multi-user twitter link 2020-06-17 12:15:13 +00:00			`path = path`

			`if "," in username:`
			`query.fromUser = username.split(",")`
			`path = "/search"`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00
Partial fix for wrong multi-user twitter link 2020-06-17 12:15:13 +00:00			`if "/search" notin path and query.fromUser.len < 2:`
Add canonical header to help search engines Fixes #472 2021-12-30 02:59:11 +00:00			`return $(twitter / path)`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00
			`let p = {`
Fix Twitter link for searches 2020-06-02 20:31:46 +00:00			`"f": if query.kind == users: "user" else: "live",`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`"q": genQueryParam(query),`
Misc. changes 2020-06-01 00:25:39 +00:00			`"src": "typed_query"`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`}`

Partial fix for wrong multi-user twitter link 2020-06-17 12:15:13 +00:00			`result = $(twitter / path ? p)`
Show Twitter link on search pages 2019-10-08 13:07:10 +00:00			`if username.len > 0:`
			`result = result.replace("/" & username, "")`
Support tweet locations 2019-12-21 04:44:58 +00:00
			`proc getLocation*(u: Profile \| Tweet): (string, string) =`
Fix displaying urls in location fields 2020-03-09 00:03:24 +00:00			`if "://" in u.location: return (u.location, "")`
Support tweet locations 2019-12-21 04:44:58 +00:00			`let loc = u.location.split(":")`
			`let url = if loc.len > 1: "/search?q=place:" & loc[1] else: ""`
			`(loc[0], url)`
Detect suspended accounts 2020-04-14 21:56:31 +00:00
			`proc getSuspended*(username: string): string =`
			`&"User \"{username}\" has been suspended"`