From aea884c48ef1a429790e8c401e98548d2d6de841 Mon Sep 17 00:00:00 2001 From: Zed Date: Fri, 3 Mar 2023 21:19:21 +0100 Subject: [PATCH] Deduplicate note tweet parsing --- src/parser.nim | 4 +- src/parserutils.nim | 94 +++++++++++++++------------------------------ tests/test_card.py | 2 +- 3 files changed, 35 insertions(+), 65 deletions(-) diff --git a/src/parser.nim b/src/parser.nim index 3858956..a491cea 100644 --- a/src/parser.nim +++ b/src/parser.nim @@ -204,12 +204,12 @@ proc parseTweet(js: JsonNode; jsCard: JsonNode = newJNull()): Tweet = ) ) + result.expandTweetEntities(js) + # fix for pinned threads if result.hasThread and result.threadId == 0: result.threadId = js{"self_thread", "id_str"}.getId - result.expandTweetEntities(js) - if js{"is_quote_status"}.getBool: result.quote = some Tweet(id: js{"quoted_status_id_str"}.getId) diff --git a/src/parserutils.nim b/src/parserutils.nim index 22c3d86..8ae9cd0 100644 --- a/src/parserutils.nim +++ b/src/parserutils.nim @@ -230,47 +230,37 @@ proc expandUserEntities*(user: var User; js: JsonNode) = user.bio = user.bio.replacef(unRegex, unReplace) .replacef(htRegex, htReplace) -proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = - let - orig = tweet.text.toRunes - textRange = js{"display_text_range"} - textSlice = textRange{0}.getInt .. textRange{1}.getInt - hasQuote = js{"is_quote_status"}.getBool - hasCard = tweet.card.isSome - - var replyTo = "" - if tweet.replyId != 0: - with reply, js{"in_reply_to_screen_name"}: - tweet.reply.add reply.getStr - replyTo = reply.getStr - - let ent = ? js{"entities"} +proc expandTextEntities(tweet: Tweet; entities: JsonNode; text: string; textSlice: Slice[int]; + replyTo=""; hasQuote=false) = + let hasCard = tweet.card.isSome var replacements = newSeq[ReplaceSlice]() - with urls, ent{"urls"}: + with urls, entities{"urls"}: for u in urls: let urlStr = u["url"].getStr - if urlStr.len == 0 or urlStr notin tweet.text: + if urlStr.len == 0 or urlStr notin text: continue + replacements.extractUrls(u, textSlice.b, hideTwitter = hasQuote) + if hasCard and u{"url"}.getStr == get(tweet.card).url: get(tweet.card).url = u{"expanded_url"}.getStr - with media, ent{"media"}: + with media, entities{"media"}: for m in media: replacements.extractUrls(m, textSlice.b, hideTwitter = true) - if "hashtags" in ent: - for hashtag in ent["hashtags"]: + if "hashtags" in entities: + for hashtag in entities["hashtags"]: replacements.extractHashtags(hashtag) - if "symbols" in ent: - for symbol in ent["symbols"]: + if "symbols" in entities: + for symbol in entities["symbols"]: replacements.extractHashtags(symbol) - if "user_mentions" in ent: - for mention in ent["user_mentions"]: + if "user_mentions" in entities: + for mention in entities["user_mentions"]: let name = mention{"screen_name"}.getStr slice = mention.extractSlice @@ -287,47 +277,27 @@ proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = replacements.deduplicate replacements.sort(cmp) - tweet.text = orig.replacedWith(replacements, textSlice) - .strip(leading=false) + tweet.text = text.toRunes.replacedWith(replacements, textSlice).strip(leading=false) -proc expandNoteTweetEntities*(tweet: Tweet; noteTweet: JsonNode) = +proc expandTweetEntities*(tweet: Tweet; js: JsonNode) = let - text = noteTweet{"text"}.getStr - orig = text.toRunes - ent = ? noteTweet{"entity_set"} - hasCard = tweet.card.isSome + entities = ? js{"entities"} + hasQuote = js{"is_quote_status"}.getBool + textRange = js{"display_text_range"} + textSlice = textRange{0}.getInt .. textRange{1}.getInt - var replacements = newSeq[ReplaceSlice]() + var replyTo = "" + if tweet.replyId != 0: + with reply, js{"in_reply_to_screen_name"}: + replyTo = reply.getStr + tweet.reply.add replyTo - with urls, ent{"urls"}: - for u in urls: - let urlStr = u["url"].getStr - if urlStr.len == 0 or urlStr notin text: - continue - replacements.extractUrls(u, orig.len, hideTwitter = false) - if hasCard and u{"url"}.getStr == get(tweet.card).url: - get(tweet.card).url = u{"expanded_url"}.getStr + tweet.expandTextEntities(entities, tweet.text, textSlice, replyTo, hasQuote) - if "hashtags" in ent: - for hashtag in ent["hashtags"]: - replacements.extractHashtags(hashtag) +proc expandNoteTweetEntities*(tweet: Tweet; js: JsonNode) = + let + entities = ? js{"entity_set"} + text = js{"text"}.getStr + textSlice = 0..text.runeLen - if "symbols" in ent: - for symbol in ent["symbols"]: - replacements.extractHashtags(symbol) - - if "user_mentions" in ent: - for mention in ent["user_mentions"]: - let - name = mention{"screen_name"}.getStr - slice = mention.extractSlice - idx = tweet.reply.find(name) - - replacements.add ReplaceSlice(kind: rkMention, slice: slice, - url: "/" & name, display: mention["name"].getStr) - - replacements.deduplicate - replacements.sort(cmp) - - tweet.text = orig.replacedWith(replacements, 0..orig.len) - .strip(leading=false) + tweet.expandTextEntities(entities, text, textSlice) diff --git a/tests/test_card.py b/tests/test_card.py index 77e44e1..51945d6 100644 --- a/tests/test_card.py +++ b/tests/test_card.py @@ -42,7 +42,7 @@ no_thumb = [ ['nim_lang/status/1082989146040340480', 'Nim in 2018: A short recap', - '36 votes and 46 comments so far on Reddit', + 'Posted by u/miran1 - 36 votes and 46 comments', 'reddit.com'] ]