nitter/src/formatters.nim

# SPDX-License-Identifier: AGPL-3.0-only
import strutils, strformat, times, uri, tables, xmltree, htmlparser, htmlgen
import std/[enumerate, re]
import types, utils, query

const
  # Host/path prefix of Twitter card links; rewritten by replaceUrls.
  cards = "cards.twitter.com/cards"
  # Base URL of the t.co link shortener; rewritten by replaceUrls.
  tco = "https://t.co"
  # Parsed base URI that getTwitterLink extends with a path and query.
  twitter = parseUri("https://twitter.com")

let
  # Bare "twitter.com" host (optionally prefixed by "www." or "mobile.")
  # located right after whitespace, or right after an "https://" that is
  # itself not preceded by a non-space character.
  twRegex = re"(?<=(?<!\S)https:\/\/|(?<=\s))(www\.|mobile\.)?twitter\.com"
  # Anchor whose href and visible text both point at twitter.com.
  # $1 = path from the href, $2 = path from the link text.
  twLinkRegex = re"""<a href="https:\/\/twitter\.com([^"]+)">twitter\.com(\S+)</a>"""

  # NOTE(review): `[A-z]` (here and below) also matches the ASCII punctuation
  # between 'Z' and 'a' ("[", "\", "]", "^", "_", "`"); left as-is to keep
  # current matching behaviour.
  ytRegex = re"([A-z.]+\.)?youtu(be\.com|\.be)"
  igRegex = re"(www\.)?instagram\.com"

  # The lookbehind rejects hosts preceded by '.' or 'b'.
  rdRegex = re"(?<![.b])((www|np|new|amp|old)\.)?reddit\.com"
  rdShortRegex = re"(?<![.b])redd\.it\/"
  # Videos cannot be supported uniformly between Teddit and Libreddit,
  # so v.redd.it links will not be replaced.
  # Images aren't supported due to errors from Teddit when the image
  # wasn't first displayed via a post on the Teddit instance.

  # Leading "http://" or "https://" plus an optional "www."/"wwwN." label.
  wwwRegex = re"https?://(www[0-9]?\.)?"
  # Captures the value of a `url="...m3u8"` attribute.
  m3u8Regex = re"""url="(.+\.m3u8)""""
  # Size suffix directly before the file extension at the end of the string;
  # $2 = the extension including its dot.
  userPicRegex = re"_(normal|bigger|mini|200x200|400x400)(\.[A-z]+)$"
  # File extension (including the dot) at the end of the string.
  extRegex = re"(\.[A-z]+)$"
  # Any character outside the ranges permitted by the XML 1.0 `Char`
  # production.
  illegalXmlRegex = re"(*UTF8)[^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]"

proc getUrlPrefix*(cfg: Config): string =
  ## Returns the scheme prefix followed by `cfg.hostname`.
  ##
  ## The `https` prefix (defined outside this file) is used when
  ## `cfg.useHttps` is set, otherwise the literal "http://".
  let scheme =
    if cfg.useHttps: https
    else: "http://"
  scheme & cfg.hostname

proc shortLink*(text: string; length=28): string =
  ## Strips the leading scheme and optional "www." prefix from `text` and,
  ## if what remains is longer than `length` bytes, truncates it and appends
  ## an ellipsis.
  ##
  ## The cut never lands inside a multi-byte UTF-8 sequence, so the result
  ## may keep slightly fewer than `length` bytes. A negative `length` is
  ## treated as 0.
  result = text.replace(wwwRegex, "")
  if result.len > length:
    var cut = max(length, 0)
    # `result[cut]` is the first byte that would be dropped. If it is a UTF-8
    # continuation byte (0b10xxxxxx) the character started earlier, so step
    # back to its lead byte instead of emitting a broken sequence.
    while cut > 0 and (result[cut].ord and 0xC0) == 0x80:
      dec cut
    result.setLen(cut)
    result.add "…"
    
proc stripHtml*(text: string; shorten=false): string =
  ## Parses `text` as HTML and returns its inner text.
  ##
  ## For every non-empty `<a>` element whose href contains "http", the text
  ## of its first child is replaced by the href itself, or by
  ## `shortLink(href)` when `shorten` is true.
  let doc = parseHtml(text)
  for anchor in doc.findAll("a"):
    let href = anchor.attr("href")
    # Leave anchors without an http(s) target or without children untouched.
    if "http" notin href or anchor.len == 0:
      continue
    anchor[0].text = (if shorten: shortLink(href) else: href)
  doc.innerText()

proc sanitizeXml*(text: string): string =
  ## Returns `text` with every match of `illegalXmlRegex` removed.
  result = replace(text, illegalXmlRegex, "")

proc replaceUrls*(body: string; prefs: Prefs; absolute=""): string =
  ## Rewrites YouTube, Twitter, Reddit and Instagram hosts found in `body`
  ## to the replacement hosts configured in `prefs`, and returns the result.
  ##
  ## Each rewrite is skipped when the corresponding `prefs.replace*` string
  ## is empty. When `absolute` is non-empty, every `href="/...` attribute is
  ## prefixed with it.
  result = body

  if prefs.replaceYouTube.len > 0 and "youtu" in result:
    result = result.replace(ytRegex, prefs.replaceYouTube)
    # Drops every "/c/" segment in the whole string (not only inside the
    # rewritten YouTube links) once the replacement host is present.
    if prefs.replaceYouTube in result:
      result = result.replace("/c/", "/")

  # Note: this guard inspects the original `body`, not the partially
  # rewritten `result`.
  if prefs.replaceTwitter.len > 0 and ("twitter.com" in body or tco in body):
    # Order matters: t.co and card links are rewritten before the generic
    # host regex, and the anchor regex runs last.
    result = result.replace(tco, https & prefs.replaceTwitter & "/t.co")
    result = result.replace(cards, prefs.replaceTwitter & "/cards")
    result = result.replace(twRegex, prefs.replaceTwitter)
    # Rebuild anchors that still point at twitter.com: $1 is the href path,
    # $2 the path shown in the link text.
    result = result.replacef(twLinkRegex, a(
      prefs.replaceTwitter & "$2", href = https & prefs.replaceTwitter & "$1"))

  if prefs.replaceReddit.len > 0 and ("reddit.com" in result or "redd.it" in result):
    # Short links are expanded to "<host>/comments/" before full hosts are
    # replaced.
    result = result.replace(rdShortRegex, prefs.replaceReddit & "/comments/")
    result = result.replace(rdRegex, prefs.replaceReddit)
    # Applies to every "/gallery/" occurrence in the string.
    if prefs.replaceReddit in result and "/gallery/" in result:
      result = result.replace("/gallery/", "/comments/")

  if prefs.replaceInstagram.len > 0 and "instagram.com" in result:
    result = result.replace(igRegex, prefs.replaceInstagram)

  # Turn root-relative links into absolute ones.
  if absolute.len > 0 and "href" in result:
    result = result.replace("href=\"/", "href=\"" & absolute & "/")

proc getM3u8Url*(content: string): string =
  ## Returns the first capture of `m3u8Regex` in `content`, or an empty
  ## string when the pattern does not match.
  var captured: array[1, string]
  let pos = re.find(content, m3u8Regex, captured)
  result =
    if pos != -1: captured[0]
    else: ""

proc proxifyVideo*(manifest: string; proxy: bool): string =
  ## Rewrites every root-relative URL in `manifest` to an absolute
  ## "https://video.twimg.com" URL, passed through `getVidUrl` when `proxy`
  ## is true, and returns the rewritten manifest.
  ##
  ## A URL is taken either from a whole line or from the quoted value of an
  ## `#EXT-X-MAP:URI` tag. Lines that do not yield a value starting with '/'
  ## are left unchanged.
  var replacements: seq[(string, string)]
  for line in manifest.splitLines:
    # For a map tag, skip the 16-character `#EXT-X-MAP:URI="` prefix and the
    # final character (presumably the closing quote). The length guard keeps
    # a truncated tag from raising on the slice; such a line falls through
    # as-is and is ignored below because it starts with '#'.
    let url =
      if line.len > 16 and line.startsWith("#EXT-X-MAP:URI"): line[16 .. ^2]
      else: line
    if url.startsWith('/'):
      let path = "https://video.twimg.com" & url
      replacements.add (url, if proxy: path.getVidUrl else: path)
  return manifest.multiReplace(replacements)

proc getUserPic*(userPic: string; style=""): string =
  ## Removes a trailing size suffix matched by `userPicRegex` from `userPic`
  ## and then inserts `style` directly before the file extension.
  let unsized = userPic.replacef(userPicRegex, "$2")
  result = unsized.replacef(extRegex, style & "$1")

proc getUserPic*(user: User; style=""): string =
  ## Convenience overload: applies `getUserPic` to `user.userPic`.
  user.userPic.getUserPic(style)

proc getVideoEmbed*(cfg: Config; id: int64): string =
  ## Builds "<url prefix>/i/videos/<id>" using `getUrlPrefix(cfg)`.
  getUrlPrefix(cfg) & "/i/videos/" & $id

proc pageTitle*(user: User): string =
  ## Returns "<fullname> (@<username>)" for `user`.
  user.fullname & " (@" & user.username & ")"

proc pageTitle*(tweet: Tweet): string =
  ## Returns the author's page title followed by the tweet text, stripped of
  ## HTML and wrapped in double quotes.
  let author = pageTitle(tweet.user)
  let plainText = stripHtml(tweet.text)
  author & ": \"" & plainText & "\""

proc pageDesc*(user: User): string =
  ## Returns the user's bio with HTML stripped, or a generic
  ## "The latest tweets from <fullname>" line when the bio is empty.
  if user.bio.len == 0:
    return "The latest tweets from " & user.fullname
  stripHtml(user.bio)

proc getJoinDate*(user: User): string =
  ## Formats `user.joinDate` as the word "Joined" followed by the full month
  ## name and the year.
  const pattern = "'Joined' MMMM YYYY"
  result = format(user.joinDate, pattern)

proc getJoinDateFull*(user: User): string =
  ## Formats `user.joinDate` as 12-hour time with AM/PM, then day,
  ## abbreviated month and year.
  const pattern = "h:mm tt - d MMM YYYY"
  result = format(user.joinDate, pattern)

proc getTime*(tweet: Tweet): string =
  ## Formats `tweet.time` as abbreviated month, day and year, a " · "
  ## separator, 12-hour time with AM/PM, and a literal " UTC" suffix.
  const pattern = "MMM d', 'YYYY' · 'h:mm tt' UTC'"
  result = format(tweet.time, pattern)

proc getRfc822Time*(tweet: Tweet): string =
  ## Formats `tweet.time` in the RFC 822 style date layout
  ## (e.g. weekday, two-digit day, month, year, 24-hour time) with a literal
  ## "GMT" zone suffix.
  const pattern = "ddd', 'dd MMM yyyy HH:mm:ss 'GMT'"
  result = format(tweet.time, pattern)

proc getShortTime*(tweet: Tweet): string =
  ## Returns a compact timestamp for `tweet`:
  ##
  ## * "d MMM yyyy" when the tweet is from a different calendar year,
  ## * "MMM d" when it is at least one day old,
  ## * "<n>h", "<n>m" or "<n>s" for younger tweets,
  ## * "now" when it is at most one second old.
  # The other formatters in this file label `tweet.time` as UTC/GMT, so take
  # the current time in UTC as well. Otherwise the year comparison below
  # mixes a local-zone year with a UTC year and misfires around New Year on
  # hosts whose zone is not UTC.
  let now = now().utc
  let since = now - tweet.time

  if now.year != tweet.time.year:
    result = tweet.time.format("d MMM yyyy")
  elif since.inDays >= 1:
    result = tweet.time.format("MMM d")
  elif since.inHours >= 1:
    result = $since.inHours & "h"
  elif since.inMinutes >= 1:
    result = $since.inMinutes & "m"
  elif since.inSeconds > 1:
    result = $since.inSeconds & "s"
  else:
    result = "now"
proc getLink*(tweet: Tweet; focus=true): string =
  ## Returns the relative status link "/<username>/status/<id>" for `tweet`,
  ## with "#m" appended when `focus` is true.
  ##
  ## An empty string is returned when `tweet.id` is 0; an empty username is
  ## replaced by "i".
  if tweet.id == 0:
    return ""
  let name =
    if tweet.user.username.len > 0: tweet.user.username
    else: "i"
  result = "/" & name & "/status/" & $tweet.id
  if focus:
    result.add "#m"

proc getTwitterLink*(path: string; params: Table[string, string]): string =
  ## Builds the twitter.com URL corresponding to `path` and `params`.
  ##
  ## For non-search paths with fewer than two `fromUser` entries this is just
  ## the twitter base URI joined with `path`. Otherwise a search URL is built
  ## with "f", "q" and "src" query parameters, and "/<name>" (the "name"
  ## entry of `params`) is removed from the result.
  var
    username = params.getOrDefault("name")
    search = initQuery(params, username)
    target = path

  # A comma-separated name means several users: force a search link.
  if "," in username:
    search.fromUser = username.split(",")
    target = "/search"

  if search.fromUser.len < 2 and "/search" notin target:
    return $(twitter / target)

  let feed =
    if search.kind == users: "user"
    else: "live"
  let queryArgs = {
    "f": feed,
    "q": genQueryParam(search),
    "src": "typed_query"
  }

  result = $(twitter / target ? queryArgs)
  if username.len > 0:
    result = result.replace("/" & username, "")

proc getLocation*(u: User | Tweet): (string, string) =
  ## Splits `u.location` into a display text and a search URL.
  ##
  ## A location containing "://" is returned unchanged with an empty URL.
  ## Otherwise the location is split on ':'; the first part is the text and,
  ## if a second part exists, it is appended to "/search?q=place:".
  if "://" in u.location:
    return (u.location, "")
  let parts = u.location.split(":")
  if parts.len > 1:
    (parts[0], "/search?q=place:" & parts[1])
  else:
    (parts[0], "")

proc getSuspended*(username: string): string =
  ## Returns the notice shown for a suspended account named `username`.
  "User \"" & username & "\" has been suspended"

proc titleize*(str: string): string =
  ## Returns `str` with every ASCII lowercase letter upper-cased when it is
  ## the first character or directly follows a space or '('.
  const
    lower = {'a'..'z'}
    boundaries = {' ', '('}

  result = str
  for idx in 0 ..< str.len:
    # `or` short-circuits, so `str[idx - 1]` is only read when idx > 0.
    let startsWord = idx == 0 or str[idx - 1] in boundaries
    if startsWord and str[idx] in lower:
      result[idx] = toUpperAscii(str[idx])
Add license headers Closes #413 2021-12-27 02:37:38 +01:00			`# SPDX-License-Identifier: AGPL-3.0-only`
Fix Twitter link replacements Fixes #492 2021-12-30 05:11:05 +01:00			`import strutils, strformat, times, uri, tables, xmltree, htmlparser, htmlgen`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`import std/[enumerate, re]`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`import types, utils, query`
Initial commit 2019-06-20 16:16:20 +02:00
			`const`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`cards = "cards.twitter.com/cards"`
			`tco = "https://t.co"`
			`twitter = parseUri("https://twitter.com")`

			`let`
			`twRegex = re"(?<=(?<!\S)https:\/\/\|(?<=\s))(www\.\|mobile\.)?twitter\.com"`
			`twLinkRegex = re"""<a href="https:\/\/twitter.com([^"]+)">twitter\.com(\S+)</a>"""`

Fix incorrect regex Fixes #109 2020-01-19 08:49:20 +01:00			`ytRegex = re"([A-z.]+\.)?youtu(be\.com\|\.be)"`
Fix unescaped dot in Instagram regex (#471) Similar to edb37511813ba803568a71de997031ed79ad5329 (#109) 2021-11-26 22:49:44 +01:00			`igRegex = re"(www\.)?instagram\.com"`
Add Reddit link replacement support Closes #306 Closes #353 2021-12-27 02:13:05 +01:00
			`rdRegex = re"(?<![.b])((www\|np\|new\|amp\|old)\.)?reddit.com"`
			`rdShortRegex = re"(?<![.b])redd\.it\/"`
			`# Videos cannot be supported uniformly between Teddit and Libreddit,`
			`# so v.redd.it links will not be replaced.`
			`# Images aren't supported due to errors from Teddit when the image`
			`# wasn't first displayed via a post on the Teddit instance.`

Turn regex patterns into consts 2020-01-22 13:04:35 +01:00			`wwwRegex = re"https?://(www[0-9]?\.)?"`
Optional base64 support for proxy urls 2020-06-09 15:04:38 +02:00			`m3u8Regex = re"""url="(.+.m3u8)""""`
Style fixes 2022-01-06 03:57:14 +01:00			`userPicRegex = re"_(normal\|bigger\|mini\|200x200\|400x400)(\.[A-z]+)$"`
Turn regex patterns into consts 2020-01-22 13:04:35 +01:00			`extRegex = re"(\.[A-z]+)$"`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`illegalXmlRegex = re"(*UTF8)[^\x09\x0A\x0D\x20-\x{D7FF}\x{E000}-\x{FFFD}\x{10000}-\x{10FFFF}]"`
Partial fix for wrong multi-user twitter link 2020-06-17 14:15:13 +02:00
Add proper http support Fixes #223 2021-01-08 02:25:43 +01:00			`proc getUrlPrefix*(cfg: Config): string =`
Minor code improvements 2021-12-30 04:18:40 +01:00			`if cfg.useHttps: https & cfg.hostname`
Add proper http support Fixes #223 2021-01-08 02:25:43 +01:00			`else: "http://" & cfg.hostname`

changed code to be not shit 2022-01-14 18:01:47 +01:00			`proc shortLink*(text: string; length=28): string =`
			`result = text.replace(wwwRegex, "")`
			`if result.len > length:`
			`result = result[0 ..< length] & "…"`

			`proc stripHtml*(text: string; shorten=false): string =`
Unshortify links when stripping html 2019-10-11 19:20:40 +02:00			`var html = parseHtml(text)`
			`for el in html.findAll("a"):`
			`let link = el.attr("href")`
			`if "http" in link:`
Misc. changes 2020-06-01 02:25:39 +02:00			`if el.len == 0: continue`
changed code to be not shit 2022-01-14 18:01:47 +01:00			`el[0].text =`
			`if shorten: link.shortLink`
			`else: link`
Minor cleanup, fix empty lines before card links 2019-10-10 17:47:02 +02:00			`html.innerText()`

Sanitize XML to remove invalid characters Fixes #268 2020-11-07 23:53:49 +01:00			`proc sanitizeXml*(text: string): string =`
			`text.replace(illegalXmlRegex, "")`

Rename replaceUrl to replaceUrls 2021-12-27 02:27:49 +01:00			`proc replaceUrls*(body: string; prefs: Prefs; absolute=""): string =`
			`result = body`
Avoid unnecessary string allocations in replaceUrl 2021-12-27 00:42:52 +01:00
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`if prefs.replaceYouTube.len > 0 and "youtu" in result:`
Include 'www.' in twitter/youtube link replacement 2019-08-15 19:19:21 +02:00			`result = result.replace(ytRegex, prefs.replaceYouTube)`
Fix converted youtube channel links 2020-03-09 00:47:00 +01:00			`if prefs.replaceYouTube in result:`
			`result = result.replace("/c/", "/")`
Avoid unnecessary string allocations in replaceUrl 2021-12-27 00:42:52 +01:00
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`if prefs.replaceTwitter.len > 0 and ("twitter.com" in body or tco in body):`
Minor code improvements 2021-12-30 04:18:40 +01:00			`result = result.replace(tco, https & prefs.replaceTwitter & "/t.co")`
Fix card links 2020-03-09 00:33:52 +01:00			`result = result.replace(cards, prefs.replaceTwitter & "/cards")`
			`result = result.replace(twRegex, prefs.replaceTwitter)`
Add experimental support for unified_card Closes #345 2022-01-13 00:36:30 +01:00			`result = result.replacef(twLinkRegex, a(`
Fix Twitter link replacements Fixes #492 2021-12-30 05:11:05 +01:00			`prefs.replaceTwitter & "$2", href = https & prefs.replaceTwitter & "$1"))`
Avoid unnecessary string allocations in replaceUrl 2021-12-27 00:42:52 +01:00
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`if prefs.replaceReddit.len > 0 and ("reddit.com" in result or "redd.it" in result):`
Add Reddit link replacement support Closes #306 Closes #353 2021-12-27 02:13:05 +01:00			`result = result.replace(rdShortRegex, prefs.replaceReddit & "/comments/")`
			`result = result.replace(rdRegex, prefs.replaceReddit)`
			`if prefs.replaceReddit in result and "/gallery/" in result:`
			`result = result.replace("/gallery/", "/comments/")`

Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`if prefs.replaceInstagram.len > 0 and "instagram.com" in result:`
Avoid unnecessary string allocations in replaceUrl 2021-12-27 00:42:52 +01:00			`result = result.replace(igRegex, prefs.replaceInstagram)`

			`if absolute.len > 0 and "href" in result:`
Add proper http support Fixes #223 2021-01-08 02:25:43 +01:00			`result = result.replace("href=\"/", "href=\"" & absolute & "/")`
Initial commit 2019-06-20 16:16:20 +02:00
Optional base64 support for proxy urls 2020-06-09 15:04:38 +02:00			`proc getM3u8Url*(content: string): string =`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`var matches: array[1, string]`
			`if re.find(content, m3u8Regex, matches) != -1:`
			`result = matches[0]`
Optional base64 support for proxy urls 2020-06-09 15:04:38 +02:00
Add video proxy support 2019-08-19 20:53:47 +02:00			`proc proxifyVideo*(manifest: string; proxy: bool): string =`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`var replacements: seq[(string, string)]`
			`for line in manifest.splitLines:`
			`let url =`
			`if line.startsWith("#EXT-X-MAP:URI"): line[16 .. ^2]`
			`else: line`
Fix video processing crash Closes #512 2022-01-12 19:19:14 +01:00			`if url.startsWith('/'):`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`let path = "https://video.twimg.com" & url`
			`replacements.add (url, if proxy: path.getVidUrl else: path)`
			`return manifest.multiReplace(replacements)`
Add video proxy support 2019-08-19 20:53:47 +02:00
Style fixes 2022-01-06 03:57:14 +01:00			`proc getUserPic*(userPic: string; style=""): string =`
Remove nim-regex dependency, improve performance 2022-01-11 03:10:42 +01:00			`userPic.replacef(userPicRegex, "$2").replacef(extRegex, style & "$1")`
Initial commit 2019-06-20 16:16:20 +02:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc getUserPic*(user: User; style=""): string =`
			`getUserPic(user.userPic, style)`
Initial commit 2019-06-20 16:16:20 +02:00
Change ID types to int64 2019-12-10 00:39:12 +01:00			`proc getVideoEmbed*(cfg: Config; id: int64): string =`
Add proper http support Fixes #223 2021-01-08 02:25:43 +01:00			`&"{getUrlPrefix(cfg)}/i/videos/{id}"`
Implement link previews 2019-08-07 22:02:19 +02:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc pageTitle*(user: User): string =`
			`&"{user.fullname} (@{user.username})"`
Ensure correct text formatting 2019-06-25 04:52:38 +02:00
Add tweet page titles Fixes #124 2020-03-29 09:15:05 +02:00			`proc pageTitle*(tweet: Tweet): string =`
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`&"{pageTitle(tweet.user)}: \"{stripHtml(tweet.text)}\""`
Add tweet page titles Fixes #124 2020-03-29 09:15:05 +02:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc pageDesc*(user: User): string =`
			`if user.bio.len > 0:`
			`stripHtml(user.bio)`
Display profile bio in preview 2019-10-11 18:43:47 +02:00			`else:`
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`"The latest tweets from " & user.fullname`
Implement link previews 2019-08-07 22:02:19 +02:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc getJoinDate*(user: User): string =`
			`user.joinDate.format("'Joined' MMMM YYYY")`
Revamp profile api to display more metadata 2019-08-11 21:26:55 +02:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc getJoinDateFull*(user: User): string =`
			`user.joinDate.format("h:mm tt - d MMM YYYY")`
Revamp profile api to display more metadata 2019-08-11 21:26:55 +02:00
Ensure correct text formatting 2019-06-25 04:52:38 +02:00			`proc getTime*(tweet: Tweet): string =`
Rearrange date string 2022-01-03 03:52:39 +01:00			`tweet.time.format("MMM d', 'YYYY' · 'h:mm tt' UTC'")`
Improve RSS validity 2019-09-15 11:14:03 +02:00
			`proc getRfc822Time*(tweet: Tweet): string =`
Changed procedure getRfc822Time to comply with RSS 2.0 spec (#404) Co-authored-by: David Robinson <daveed@mailbox.org> 2021-06-23 23:15:51 +02:00			`tweet.time.format("ddd', 'dd MMM yyyy HH:mm:ss 'GMT'")`
Generate tweet links 2019-07-01 23:14:36 +02:00
Misc. changes 2020-06-01 02:25:39 +02:00			`proc getShortTime*(tweet: Tweet): string =`
Preserve original UTC timestamp 2020-06-02 21:06:44 +02:00			`let now = now()`
Fix compiler warnings 2021-12-20 03:11:12 +01:00			`let since = now - tweet.time`
Preserve original UTC timestamp 2020-06-02 21:06:44 +02:00
Fix compiler warnings 2021-12-20 03:11:12 +01:00			`if now.year != tweet.time.year:`
Misc. changes 2020-06-01 02:25:39 +02:00			`result = tweet.time.format("d MMM yyyy")`
			`elif since.inDays >= 1:`
			`result = tweet.time.format("MMM d")`
			`elif since.inHours >= 1:`
			`result = $since.inHours & "h"`
			`elif since.inMinutes >= 1:`
			`result = $since.inMinutes & "m"`
			`elif since.inSeconds > 1:`
			`result = $since.inSeconds & "s"`
			`else:`
			`result = "now"`

			`proc getLink*(tweet: Tweet; focus=true): string =`
Use int for tweet ids for correct thread sorting 2019-10-10 18:22:14 +02:00			`if tweet.id == 0: return`
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`var username = tweet.user.username`
Misc. changes 2020-06-01 02:25:39 +02:00			`if username.len == 0:`
			`username = "i"`
			`result = &"/{username}/status/{tweet.id}"`
Focus main tweet in threads 2019-10-22 09:17:58 +02:00			`if focus: result &= "#m"`
Show reasons for tweets being withheld Fixes #33 2019-09-08 14:34:26 +02:00
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`proc getTwitterLink*(path: string; params: Table[string, string]): string =`
Partial fix for wrong multi-user twitter link 2020-06-17 14:15:13 +02:00			`var`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`username = params.getOrDefault("name")`
			`query = initQuery(params, username)`
Partial fix for wrong multi-user twitter link 2020-06-17 14:15:13 +02:00			`path = path`

			`if "," in username:`
			`query.fromUser = username.split(",")`
			`path = "/search"`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00
Partial fix for wrong multi-user twitter link 2020-06-17 14:15:13 +02:00			`if "/search" notin path and query.fromUser.len < 2:`
Add canonical header to help search engines Fixes #472 2021-12-30 03:59:11 +01:00			`return $(twitter / path)`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00
			`let p = {`
Fix Twitter link for searches 2020-06-02 22:31:46 +02:00			`"f": if query.kind == users: "user" else: "live",`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`"q": genQueryParam(query),`
Misc. changes 2020-06-01 02:25:39 +02:00			`"src": "typed_query"`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`}`

Partial fix for wrong multi-user twitter link 2020-06-17 14:15:13 +02:00			`result = $(twitter / path ? p)`
Show Twitter link on search pages 2019-10-08 15:07:10 +02:00			`if username.len > 0:`
			`result = result.replace("/" & username, "")`
Support tweet locations 2019-12-21 05:44:58 +01:00
Rearchitect profile, support pins, Profile -> User 2022-01-23 07:04:50 +01:00			`proc getLocation*(u: User \| Tweet): (string, string) =`
Fix displaying urls in location fields 2020-03-09 01:03:24 +01:00			`if "://" in u.location: return (u.location, "")`
Support tweet locations 2019-12-21 05:44:58 +01:00			`let loc = u.location.split(":")`
			`let url = if loc.len > 1: "/search?q=place:" & loc[1] else: ""`
			`(loc[0], url)`
Detect suspended accounts 2020-04-14 23:56:31 +02:00
			`proc getSuspended*(username: string): string =`
			`&"User \"{username}\" has been suspended"`
Reimplement titleize without regex 2022-01-10 16:18:10 +01:00
			`proc titleize*(str: string): string =`
			`const`
			`lowercase = {'a'..'z'}`
			`delims = {' ', '('}`

			`result = str`
			`for i, c in enumerate(str):`
			`if c in lowercase and (i == 0 or str[i - 1] in delims):`
			`result[i] = c.toUpperAscii`