From c9b261a79303189f61ef5f5c6bf2c2600cdba792 Mon Sep 17 00:00:00 2001 From: Zed Date: Sun, 30 Jan 2022 23:38:39 +0100 Subject: [PATCH] WIP tweets/timeline parser --- src/api.nim | 4 +- src/experimental/parser/media.nim | 44 +++++++++++ src/experimental/parser/slices.nim | 20 ++--- src/experimental/parser/timeline.nim | 78 +++++++++++++++++--- src/experimental/parser/tweet.nim | 97 +++++++++++++++++++++++++ src/experimental/parser/unifiedcard.nim | 12 ++- src/experimental/parser/utils.nim | 9 ++- src/experimental/types/common.nim | 8 +- src/experimental/types/media.nim | 15 ++++ src/experimental/types/timeline.nim | 11 ++- src/experimental/types/tweet.nim | 85 ++++++++++++++++++++++ src/experimental/types/unifiedcard.nim | 20 +---- src/experimental/types/user.nim | 6 ++ 13 files changed, 359 insertions(+), 50 deletions(-) create mode 100644 src/experimental/parser/media.nim create mode 100644 src/experimental/parser/tweet.nim create mode 100644 src/experimental/types/media.nim create mode 100644 src/experimental/types/tweet.nim diff --git a/src/api.nim b/src/api.nim index 708b72f..433bca7 100644 --- a/src/api.nim +++ b/src/api.nim @@ -89,8 +89,8 @@ proc getSearch*[T](query: Query; after=""): Future[Result[T]] {.async.} = else: const searchMode = ("tweet_search_mode", "live") - parse = parseTimeline - fetchFunc = fetch + parse = parseTweets + fetchFunc = fetchRaw let q = genQueryParam(query) if q.len == 0 or q == emptyQuery: diff --git a/src/experimental/parser/media.nim b/src/experimental/parser/media.nim new file mode 100644 index 0000000..8af5c8b --- /dev/null +++ b/src/experimental/parser/media.nim @@ -0,0 +1,44 @@ +import std/[json, strutils, times, math] +import utils +import ".."/types/[media, tweet] +from ../../types import Poll, Gif, Video, VideoVariant, VideoType + +proc parseVideo*(entity: Entity): Video = + result = Video( + thumb: entity.mediaUrlHttps.getImageUrl, + views: entity.ext.mediaStats{"r", "ok", "viewCount"}.getStr, + available: entity.extMediaAvailability.status == "available", + title: entity.extAltText, + durationMs: entity.videoInfo.durationMillis, + description: entity.additionalMediaInfo.description, + variants: entity.videoInfo.variants + # playbackType: mp4 + ) + + if entity.additionalMediaInfo.title.len > 0: + result.title = entity.additionalMediaInfo.title + +proc parseGif*(entity: Entity): Gif = + result = Gif( + url: entity.videoInfo.variants[0].url.getImageUrl, + thumb: entity.getImageUrl + ) + +proc parsePoll*(card: Card): Poll = + let vals = card.bindingValues + + # name format is pollNchoice_* + for i in '1' .. card.name[4]: + let choice = "choice" & i + result.values.add parseInt(vals{choice & "_count", "string_value"}.getStr("0")) + result.options.add vals{choice & "_label", "string_value"}.getStr + + let time = vals{"end_datetime_utc", "string_value"}.getStr.parseIsoDate + if time > now(): + let timeLeft = $(time - now()) + result.status = timeLeft[0 ..< timeLeft.find(",")] + else: + result.status = "Final results" + + result.leader = result.values.find(max(result.values)) + result.votes = result.values.sum diff --git a/src/experimental/parser/slices.nim b/src/experimental/parser/slices.nim index 45e6e1d..887b8cc 100644 --- a/src/experimental/parser/slices.nim +++ b/src/experimental/parser/slices.nim @@ -1,15 +1,14 @@ import std/[macros, htmlgen, unicode] -import ../types/common import ".."/../[formatters, utils] type - ReplaceSliceKind = enum + ReplaceSliceKind* = enum rkRemove, rkUrl, rkHashtag, rkMention ReplaceSlice* = object - slice: Slice[int] - kind: ReplaceSliceKind - url, display: string + slice*: Slice[int] + kind*: ReplaceSliceKind + url*, display*: string proc cmp*(x, y: ReplaceSlice): int = cmp(x.slice.a, y.slice.b) @@ -27,11 +26,14 @@ proc dedupSlices*(s: var seq[ReplaceSlice]) = inc j inc i -proc extractUrls*(result: var seq[ReplaceSlice]; url: Url; - textLen: int; hideTwitter = false) = +proc extractHashtags*(result: var seq[ReplaceSlice]; slice: Slice[int]) = + result.add ReplaceSlice(kind: rkHashtag, slice: slice) + +proc extractUrls*[T](result: var seq[ReplaceSlice]; entity: T; + textLen: int; hideTwitter = false) = let - link = url.expandedUrl - slice = url.indices[0] ..< url.indices[1] + link = entity.expandedUrl + slice = entity.indices if hideTwitter and slice.b.succ >= textLen and link.isTwitterUrl: if slice.a < textLen: diff --git a/src/experimental/parser/timeline.nim b/src/experimental/parser/timeline.nim index 351ca85..8f7a8b6 100644 --- a/src/experimental/parser/timeline.nim +++ b/src/experimental/parser/timeline.nim @@ -1,13 +1,50 @@ -import std/[strutils, tables] +import std/[strutils, tables, options] import jsony -import user, ../types/timeline -from ../../types import Result, User +import user, tweet, utils, ../types/timeline +from ../../types import Result, User, Tweet + +proc parseHook(s: string; i: var int; v: var Slice[int]) = + var slice: array[2, int] + parseHook(s, i, slice) + v = slice[0] ..< slice[1] proc getId(id: string): string {.inline.} = let start = id.rfind("-") if start < 0: return id id[start + 1 ..< id.len] +proc processTweet(id: string; objects: GlobalObjects; + userCache: var Table[string, User]): Tweet = + let raw = objects.tweets[id] + result = toTweet raw + + let uid = result.user.id + if uid.len > 0 and uid in objects.users: + if uid notin userCache: + userCache[uid] = toUser objects.users[uid] + result.user = userCache[uid] + + let rtId = raw.retweetedStatusIdStr + if rtId.len > 0: + if rtId in objects.tweets: + result.retweet = some processTweet(rtId, objects, userCache) + else: + result.retweet = some Tweet(id: rtId.toId) + + let qId = raw.quotedStatusIdStr + if qId.len > 0: + if qId in objects.tweets: + result.quote = some processTweet(qId, objects, userCache) + else: + result.quote = some Tweet(id: qId.toId) + +proc parseCursor[T](e: Entry; result: var Result[T]) = + let cursor = e.content.operation.cursor + if cursor.cursorType == "Top": + result.top = cursor.value + elif cursor.cursorType == "Bottom": + result.bottom = cursor.value + proc parseUsers*(json: string; after=""): Result[User] = result = Result[User](beginning: after.len == 0) @@ -16,13 +53,32 @@ proc parseUsers*(json: string; after=""): Result[User] = return for e in raw.timeline.instructions[0].addEntries.entries: - let id = e.entryId.getId - if e.entryId.startsWith("user"): + let + eId = e.entryId + id = eId.getId + + if eId.startsWith("user") or eId.startsWith("sq-U"): if id in raw.globalObjects.users: result.content.add toUser raw.globalObjects.users[id] - elif e.entryId.startsWith("cursor"): - let cursor = e.content.operation.cursor - if cursor.cursorType == "Top": - result.top = cursor.value - elif cursor.cursorType == "Bottom": - result.bottom = cursor.value + elif eId.startsWith("cursor") or eId.startsWith("sq-C"): + parseCursor(e, result) + +proc parseTweets*(json: string; after=""): Result[Tweet] = + result = Result[Tweet](beginning: after.len == 0) + + let raw = json.fromJson(Search) + if raw.timeline.instructions.len == 0: + return + + var userCache: Table[string, User] + + for e in raw.timeline.instructions[0].addEntries.entries: + let + eId = e.entryId + id = eId.getId + + if eId.startsWith("tweet") or eId.startsWith("sq-I-t"): + if id in raw.globalObjects.tweets: + result.content.add processTweet(id, raw.globalObjects, userCache) + elif eId.startsWith("cursor") or eId.startsWith("sq-C"): + parseCursor(e, result) diff --git a/src/experimental/parser/tweet.nim b/src/experimental/parser/tweet.nim new file mode 100644 index 0000000..f7e31bb --- /dev/null +++ b/src/experimental/parser/tweet.nim @@ -0,0 +1,97 @@ +import std/[strutils, options, algorithm, json] +import std/unicode except strip +import utils, slices, media, user +import ../types/tweet +from ../types/media as mediaTypes import MediaType +from ../../types import Tweet, User, TweetStats + +proc expandTweetEntities(tweet: var Tweet; raw: RawTweet) = + let + orig = raw.fullText.toRunes + textRange = raw.displayTextRange + textSlice = textRange[0] .. textRange[1] + hasCard = raw.card.isSome + + var replyTo = "" + if tweet.replyId > 0: + tweet.reply.add raw.inReplyToScreenName + replyTo = raw.inReplyToScreenName + + var replacements = newSeq[ReplaceSlice]() + + for u in raw.entities.urls: + if u.url.len == 0 or u.url notin raw.fullText: + continue + + replacements.extractUrls(u, textSlice.b, hideTwitter=raw.isQuoteStatus) + # if hasCard and u.url == get(tweet.card).url: + # get(tweet.card).url = u.expandedUrl + + for m in raw.entities.media: + replacements.extractUrls(m, textSlice.b, hideTwitter=true) + + for hashtag in raw.entities.hashtags: + replacements.extractHashtags(hashtag.indices) + + for symbol in raw.entities.symbols: + replacements.extractHashtags(symbol.indices) + + for mention in raw.entities.userMentions: + let + name = mention.screenName + idx = tweet.reply.find(name) + + if mention.indices.a >= textSlice.a: + replacements.add ReplaceSlice(kind: rkMention, slice: mention.indices, + url: "/" & name, display: mention.name) + if idx > -1 and name != replyTo: + tweet.reply.delete idx + elif idx == -1 and tweet.replyId != 0: + tweet.reply.add name + + replacements.dedupSlices + replacements.sort(cmp) + + tweet.text = orig.replacedWith(replacements, textSlice) + .strip(leading=false) + +proc toTweet*(raw: RawTweet): Tweet = + result = Tweet( + id: raw.idStr.toId, + threadId: raw.conversationIdStr.toId, + replyId: raw.inReplyToStatusIdStr.toId, + time: parseTwitterDate(raw.createdAt), + hasThread: raw.selfThread.idStr.len > 0, + available: true, + user: User(id: raw.userIdStr), + stats: TweetStats( + replies: raw.replyCount, + retweets: raw.retweetCount, + likes: raw.favoriteCount, + quotes: raw.quoteCount + ) + ) + + result.expandTweetEntities(raw) + + if raw.card.isSome: + let card = raw.card.get + if "poll" in card.name: + result.poll = some parsePoll(card) + if "image" in card.name: + result.photos.add card.bindingValues{"image_large", "image_value", "url"} + .getStr.getImageUrl + # elif card.name == "amplify": + # discard + # # result.video = some(parsePromoVideo(jsCard{"binding_values"})) + # else: + # result.card = some parseCard(card, raw.entities.urls) + + for m in raw.extendedEntities.media: + case m.kind + of photo: result.photos.add m.getImageUrl + of video: + result.video = some parseVideo(m) + if m.additionalMediaInfo.sourceUser.isSome: + result.attribution = some toUser get(m.additionalMediaInfo.sourceUser) + of animatedGif: result.gif = some parseGif(m) diff --git a/src/experimental/parser/unifiedcard.nim b/src/experimental/parser/unifiedcard.nim index 337c3b9..1cffb8a 100644 --- a/src/experimental/parser/unifiedcard.nim +++ b/src/experimental/parser/unifiedcard.nim @@ -1,11 +1,8 @@ -import std/[options, tables, strutils, strformat, sugar] +import std/[options, tables, strformat] import jsony -import ../types/unifiedcard +import utils +import ".."/types/[unifiedcard, media] from ../../types import Card, CardKind, Video -from ../../utils import twimg, https - -proc getImageUrl(entity: MediaEntity): string = - entity.mediaUrlHttps.dup(removePrefix(twimg), removePrefix(https)) proc parseDestination(id: string; card: UnifiedCard; result: var Card) = let destination = card.destinationObjects[id].data @@ -66,6 +63,7 @@ proc parseMedia(component: Component; card: UnifiedCard; result: var Card) = durationMs: videoInfo.durationMillis, variants: videoInfo.variants ) + of animatedGif: discard proc parseUnifiedCard*(json: string): Card = let card = json.fromJson(UnifiedCard) @@ -78,7 +76,7 @@ proc parseUnifiedCard*(json: string): Card = component.data.parseAppDetails(card, result) of mediaWithDetailsHorizontal: component.data.parseMediaDetails(card, result) - of media, swipeableMedia: + of ComponentType.media, swipeableMedia: component.parseMedia(card, result) of buttonGroup: discard diff --git a/src/experimental/parser/utils.nim b/src/experimental/parser/utils.nim index 999683d..17afd83 100644 --- a/src/experimental/parser/utils.nim +++ b/src/experimental/parser/utils.nim @@ -1,12 +1,16 @@ # SPDX-License-Identifier: AGPL-3.0-only import std/[sugar, strutils, times] -import ../types/common +import ".."/types/[common, media, tweet] import ../../utils as uutils template parseTime(time: string; f: static string; flen: int): DateTime = if time.len != flen: return parse(time, f, utc()) +proc toId*(id: string): int64 = + if id.len == 0: 0'i64 + else: parseBiggestInt(id) + proc parseIsoDate*(date: string): DateTime = date.parseTime("yyyy-MM-dd\'T\'HH:mm:ss\'Z\'", 20) @@ -16,6 +20,9 @@ proc parseTwitterDate*(date: string): DateTime = proc getImageUrl*(url: string): string = url.dup(removePrefix(twimg), removePrefix(https)) +proc getImageUrl*(entity: MediaEntity | Entity): string = + entity.mediaUrlHttps.getImageUrl + template handleErrors*(body) = if json.startsWith("{\"errors"): for error {.inject.} in json.fromJson(Errors).errors: diff --git a/src/experimental/types/common.nim b/src/experimental/types/common.nim index e979015..512a095 100644 --- a/src/experimental/types/common.nim +++ b/src/experimental/types/common.nim @@ -1,3 +1,4 @@ +import jsony from ../../types import Error type @@ -5,7 +6,7 @@ type url*: string expandedUrl*: string displayUrl*: string - indices*: array[2, int] + indices*: Slice[int] ErrorObj* = object code*: Error @@ -18,3 +19,8 @@ proc contains*(codes: set[Error]; errors: Errors): bool = for e in errors.errors: if e.code in codes: return true + +proc parseHook*(s: string; i: var int; v: var Slice[int]) = + var slice: array[2, int] + parseHook(s, i, slice) + v = slice[0] ..< slice[1] diff --git a/src/experimental/types/media.nim b/src/experimental/types/media.nim new file mode 100644 index 0000000..924a25e --- /dev/null +++ b/src/experimental/types/media.nim @@ -0,0 +1,15 @@ +import options +from ../../types import VideoType, VideoVariant + +type + MediaType* = enum + photo, video, animatedGif + + MediaEntity* = object + kind*: MediaType + mediaUrlHttps*: string + videoInfo*: Option[VideoInfo] + + VideoInfo* = object + durationMillis*: int + variants*: seq[VideoVariant] diff --git a/src/experimental/types/timeline.nim b/src/experimental/types/timeline.nim index 28239ad..c87e511 100644 --- a/src/experimental/types/timeline.nim +++ b/src/experimental/types/timeline.nim @@ -1,13 +1,14 @@ import std/tables -import user +import user, tweet type Search* = object globalObjects*: GlobalObjects timeline*: Timeline - GlobalObjects = object + GlobalObjects* = object users*: Table[string, RawUser] + tweets*: Table[string, RawTweet] Timeline = object instructions*: seq[Instructions] @@ -15,9 +16,13 @@ type Instructions = object addEntries*: tuple[entries: seq[Entry]] - Entry = object + Entry* = object entryId*: string content*: tuple[operation: Operation] Operation = object cursor*: tuple[value, cursorType: string] + +proc renameHook*(v: var Entity; fieldName: var string) = + if fieldName == "type": + fieldName = "kind" diff --git a/src/experimental/types/tweet.nim b/src/experimental/types/tweet.nim new file mode 100644 index 0000000..3004167 --- /dev/null +++ b/src/experimental/types/tweet.nim @@ -0,0 +1,85 @@ +import options +import jsony +from json import JsonNode +import user, media, common + +type + RawTweet* = object + createdAt*: string + idStr*: string + fullText*: string + displayTextRange*: array[2, int] + entities*: Entities + extendedEntities*: ExtendedEntities + inReplyToStatusIdStr*: string + inReplyToScreenName*: string + userIdStr*: string + isQuoteStatus*: bool + replyCount*: int + retweetCount*: int + favoriteCount*: int + quoteCount*: int + conversationIdStr*: string + favorited*: bool + retweeted*: bool + selfThread*: tuple[idStr: string] + card*: Option[Card] + quotedStatusIdStr*: string + retweetedStatusIdStr*: string + + Card* = object + name*: string + url*: string + bindingValues*: JsonNode + + Entities* = object + hashtags*: seq[Hashtag] + symbols*: seq[Hashtag] + userMentions*: seq[UserMention] + urls*: seq[Url] + media*: seq[Entity] + + Hashtag* = object + indices*: Slice[int] + + UserMention* = object + screenName*: string + name*: string + indices*: Slice[int] + + ExtendedEntities* = object + media*: seq[Entity] + + Entity* = object + kind*: MediaType + indices*: Slice[int] + mediaUrlHttps*: string + url*: string + expandedUrl*: string + videoInfo*: VideoInfo + ext*: Ext + extMediaAvailability*: tuple[status: string] + extAltText*: string + additionalMediaInfo*: AdditionalMediaInfo + sourceStatusIdStr*: string + sourceUserIdStr*: string + + AdditionalMediaInfo* = object + sourceUser*: Option[RawUser] + title*: string + description*: string + + Ext* = object + mediaStats*: JsonNode + + MediaStats* = object + ok*: tuple[viewCount: string] + +proc renameHook*(v: var Entity; fieldName: var string) = + if fieldName == "type": + fieldName = "kind" + +proc parseHook*(s: string; i: var int; v: var Slice[int]) = + var slice: array[2, int] + parseHook(s, i, slice) + v = slice[0] ..< slice[1] diff --git a/src/experimental/types/unifiedcard.nim b/src/experimental/types/unifiedcard.nim index 16500df..eba0385 100644 --- a/src/experimental/types/unifiedcard.nim +++ b/src/experimental/types/unifiedcard.nim @@ -1,5 +1,5 @@ import options, tables -from ../../types import VideoType, VideoVariant +import media as mediaTypes type UnifiedCard* = object @@ -38,25 +38,13 @@ type id*: string destination*: string - Destination* = object - kind*: string - data*: tuple[urlData: UrlData] - UrlData* = object url*: string vanity*: string - MediaType* = enum - photo, video - - MediaEntity* = object - kind*: MediaType - mediaUrlHttps*: string - videoInfo*: Option[VideoInfo] - - VideoInfo* = object - durationMillis*: int - variants*: seq[VideoVariant] + Destination* = object + kind*: string + data*: tuple[urlData: UrlData] AppType* = enum androidApp, iPhoneApp, iPadApp diff --git a/src/experimental/types/user.nim b/src/experimental/types/user.nim index 1c8a5c3..a7fd727 100644 --- a/src/experimental/types/user.nim +++ b/src/experimental/types/user.nim @@ -1,4 +1,5 @@ import options +import jsony import common type @@ -41,3 +42,8 @@ type Color* = object red*, green*, blue*: int + +proc parseHook*(s: string; i: var int; v: var Slice[int]) = + var slice: array[2, int] + parseHook(s, i, slice) + v = slice[0] ..< slice[1]