From d67ed268176884b41fbabde2f56cb25bef026224 Mon Sep 17 00:00:00 2001 From: SolitudeSF Date: Tue, 10 Nov 2020 23:33:41 +0200 Subject: [PATCH] Improve tweet url and hashtag parsing Dry Fixes --- src/parserutils.nim | 159 +++++++++++++++++++++++++++++--------------- 1 file changed, 107 insertions(+), 52 deletions(-) diff --git a/src/parserutils.nim b/src/parserutils.nim index 75d281d..58c7d2e 100644 --- a/src/parserutils.nim +++ b/src/parserutils.nim @@ -1,4 +1,4 @@ -import strutils, times, macros, htmlgen, unicode, options +import strutils, times, macros, htmlgen, unicode, options, algorithm import regex, packedjson import types, utils, formatters @@ -6,9 +6,18 @@ const unRegex = re"(^|[^A-z0-9-_./?])@([A-z0-9_]{1,15})" unReplace = "$1@$2" - htRegex = re"(^|[^\w-_./?])([#$])([\w_]+)" + htRegex = re"(^|[^\w-_./?])([##$])([\w_]+)" htReplace = "$1$2$3" +type + ReplaceSliceKind = enum + rkRemove, rkUrl, rkHashtag, rkMention + + ReplaceSlice = object + slice: Slice[int] + kind: ReplaceSliceKind + url, display: string + template isNull*(js: JsonNode): bool = js.kind == JNull template notNull*(js: JsonNode): bool = js.kind != JNull @@ -124,65 +133,92 @@ proc getTombstone*(js: JsonNode): string = result = js{"tombstoneInfo", "richText", "text"}.getStr result.removeSuffix(" Learn more") -template getSlice(text: string; slice: seq[int]): string = - text.runeSubStr(slice[0], slice[1] - slice[0]) - -proc getSlice(text: string; js: JsonNode): string = - if js.kind != JArray or js.len < 2 or js[0].kind != JInt: return text - - let slice = @[js{0}.getInt, js{1}.getInt] - text.getSlice(slice) - -proc expandUrl(text: var string; js: JsonNode; tLen: int; hideTwitter=false) = - let u = js{"url"}.getStr - if u.len == 0 or u notin text: - return +proc extractSlice(js: JsonNode): Slice[int] = + result = js["indices"][0].getInt ..< js["indices"][1].getInt +proc extractUrls(result: var seq[ReplaceSlice]; js: JsonNode; + textLen: int; hideTwitter = false) = let - url = js{"expanded_url"}.getStr - slice = js{"indices"}[1].getInt + url = js["expanded_url"].getStr + slice = js.extractSlice - if hideTwitter and slice >= tLen and url.isTwitterUrl: - text = text.replace(u, "") - text.removeSuffix(' ') - text.removeSuffix('\n') + if hideTwitter and slice.b >= textLen and url.isTwitterUrl: + if slice.a < textLen: + result.add ReplaceSlice(kind: rkRemove, slice: slice) else: - text = text.replace(u, a(shortLink(url), href=url)) + result.add ReplaceSlice(kind: rkUrl, url: url, + display: url.shortLink, slice: slice) -proc expandMention(text: var string; orig: string; js: JsonNode) = - let - name = js{"name"}.getStr - href = '/' & js{"screen_name"}.getStr - uname = orig.getSlice(js{"indices"}) - text = text.replace(uname, a(uname, href=href, title=name)) +proc extractHashtags(result: var seq[ReplaceSlice]; js: JsonNode) = + result.add ReplaceSlice(kind: rkHashtag, slice: js.extractSlice) + +proc replacedWith(runes: seq[Rune]; repls: openArray[ReplaceSlice]; + textSlice: Slice[int]): string = + template extractLowerBound(i: int; idx): int = + if i > 0: repls[idx].slice.b.succ else: textSlice.a + + result = newStringOfCap(runes.len) + for i, rep in repls: + result.add $runes[extractLowerBound(i, i - 1) ..< rep.slice.a] + case rep.kind + of rkHashtag: + let + name = $runes[rep.slice.a.succ .. rep.slice.b] + symbol = $runes[rep.slice.a] + result.add a(symbol & name, href = "/search?q=%23" & name) + of rkMention: + result.add a($runes[rep.slice], href = rep.url, title = rep.display) + of rkUrl: + result.add a(rep.display, href = rep.url) + of rkRemove: + discard + result.add $runes[extractLowerBound(repls.len, ^1) ..< textSlice.b] + +proc deduplicate(s: var seq[ReplaceSlice]) = + var + len = s.len + i = 0 + while i < len: + var j = i + 1 + while j < len: + if s[i].slice.a == s[j].slice.a: + s.del j + dec len + else: + inc j + inc i + +proc cmp(x, y: ReplaceSlice): int = cmp(x.slice.a, y.slice.b) proc expandProfileEntities*(profile: var Profile; js: JsonNode) = let - orig = profile.bio + orig = profile.bio.toRunes ent = ? js{"entities"} with urls, ent{"url", "urls"}: profile.website = urls[0]{"expanded_url"}.getStr - with urls, ent{"description", "urls"}: - for u in urls: profile.bio.expandUrl(u, orig.high) + var replacements = newSeq[ReplaceSlice]() + with urls, ent{"description", "urls"}: + for u in urls: + replacements.extractUrls(u, orig.high) + + replacements.deduplicate + replacements.sort(cmp) + + profile.bio = orig.replacedWith(replacements, 0 .. orig.len) profile.bio = profile.bio.replace(unRegex, unReplace) .replace(htRegex, htReplace) - for mention in ? ent{"user_mentions"}: - profile.bio.expandMention(orig, mention) - proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = let - orig = tweet.text + orig = tweet.text.toRunes textRange = js{"display_text_range"} - slice = @[textRange{0}.getInt, textRange{1}.getInt] + textSlice = textRange{0}.getInt .. textRange{1}.getInt hasQuote = js{"is_quote_status"}.getBool hasCard = tweet.card.isSome - tweet.text = tweet.text.getSlice(slice) - var replyTo = "" if tweet.replyId != 0: with reply, js{"in_reply_to_screen_name"}: @@ -191,26 +227,45 @@ proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = let ent = ? js{"entities"} + var replacements = newSeq[ReplaceSlice]() + with urls, ent{"urls"}: for u in urls: - tweet.text.expandUrl(u, slice[1], hasQuote) + let urlStr = u["url"].getStr + if urlStr.len == 0 or urlStr notin tweet.text: + continue + replacements.extractUrls(u, textSlice.b, hideTwitter = hasQuote) if hasCard and u{"url"}.getStr == get(tweet.card).url: get(tweet.card).url = u{"expanded_url"}.getStr with media, ent{"media"}: - for m in media: tweet.text.expandUrl(m, slice[1], hideTwitter=true) + for m in media: + replacements.extractUrls(m, textSlice.b, hideTwitter = true) - if "hashtags" in ent or "symbols" in ent: - tweet.text = tweet.text.replace(htRegex, htReplace) + if "hashtags" in ent: + for hashtag in ent["hashtags"]: + replacements.extractHashtags(hashtag) - for mention in ? ent{"user_mentions"}: - let - name = mention{"screen_name"}.getStr - idx = tweet.reply.find(name) + if "symbols" in ent: + for symbol in ent["symbols"]: + replacements.extractHashtags(symbol) - if mention{"indices"}[0].getInt >= slice[0]: - tweet.text.expandMention(orig, mention) - if idx > -1 and name != replyTo: - tweet.reply.delete idx - elif idx == -1 and tweet.replyId != 0: - tweet.reply.add name + if "user_mentions" in ent: + for mention in ent["user_mentions"]: + let + name = mention{"screen_name"}.getStr + slice = mention.extractSlice + idx = tweet.reply.find(name) + + if slice.a >= textSlice.a: + replacements.add ReplaceSlice(kind: rkMention, slice: slice, + url: "/" & name, display: mention["name"].getStr) + if idx > -1 and name != replyTo: + tweet.reply.delete idx + elif idx == -1 and tweet.replyId != 0: + tweet.reply.add name + + replacements.deduplicate + replacements.sort(cmp) + + tweet.text = orig.replacedWith(replacements, textSlice)