"""
This module contains a list of the known Twitter V2+ API expansions and fields
for each expansion, and a function for "flattening" a result set,
including all expansions inline
"""

from collections import defaultdict

EXPANSIONS = [
    "author_id",
    "in_reply_to_user_id",
    "referenced_tweets.id",
    "referenced_tweets.id.author_id",
    "entities.mentions.username",
    "attachments.poll_ids",
    "attachments.media_keys",
    "geo.place_id",
]

USER_FIELDS = [
    "created_at",
    "description",
    "entities",
    "id",
    "location",
    "name",
    "pinned_tweet_id",
    "profile_image_url",
    "protected",
    "public_metrics",
    "url",
    "username",
    "verified",
    "withheld",
]

# NOTE: "non_public_metrics", "organic_metrics" and "promoted_metrics" are
# deliberately left out of TWEET_FIELDS and MEDIA_FIELDS: they are private.
TWEET_FIELDS = [
    "attachments",
    "author_id",
    "context_annotations",
    "conversation_id",
    "created_at",
    "entities",
    "geo",
    "id",
    "in_reply_to_user_id",
    "lang",
    "public_metrics",
    "text",
    "possibly_sensitive",
    "referenced_tweets",
    "reply_settings",
    "source",
    "withheld",
]

MEDIA_FIELDS = [
    "duration_ms",
    "height",
    "media_key",
    "preview_image_url",
    "type",
    "url",
    "width",
    "public_metrics",
]

POLL_FIELDS = ["duration_minutes", "end_datetime", "id", "options", "voting_status"]

PLACE_FIELDS = [
    "contained_within",
    "country",
    "country_code",
    "full_name",
    "geo",
    "id",
    "name",
    "place_type",
]

# Query parameters requesting every known expansion and field.
EVERYTHING = {
    "expansions": ",".join(EXPANSIONS),
    "user.fields": ",".join(USER_FIELDS),
    "tweet.fields": ",".join(TWEET_FIELDS),
    "media.fields": ",".join(MEDIA_FIELDS),
    "poll.fields": ",".join(POLL_FIELDS),
    "place.fields": ",".join(PLACE_FIELDS),
}

# For endpoints focused on user objects such as looking up users and followers.
# Not all of the expansions are available for these endpoints.
USER_EVERYTHING = {
    "expansions": "pinned_tweet_id",
    "tweet.fields": ",".join(TWEET_FIELDS),
    "user.fields": ",".join(USER_FIELDS),
}


def extract_includes(response, expansion, _id="id"):
    """
    Index the objects listed under response["includes"][expansion].

    Returns a defaultdict mapping each object's `_id` value to the object.
    Looking up a key that is not present yields (and stores) an empty dict,
    so callers can expand references without checking for missing includes.
    The result is empty when the response has no "includes" or does not
    contain the requested expansion.

    Raises KeyError if an included object has no `_id` key.
    """
    included = response.get("includes", {}).get(expansion, [])
    return defaultdict(dict, {item[_id]: item for item in included})


def flatten(response):
    """
    Flatten the response. Expects an entire page response from the API (data,
    includes, meta)

    Defaults: Return empty objects for things missing in includes.
    Existing values are kept; the expansions are added inline. The response
    (and the objects inside it) is updated in place and also returned.

    A response without a "data" key (for example a page that only carries
    "meta") is returned unchanged.
    """

    # Users extracted both by id and by username for expanding mentions
    includes_users = defaultdict(
        dict,
        {
            **extract_includes(response, "users", "id"),
            **extract_includes(response, "users", "username"),
        },
    )
    # Media is by media_key, not id
    includes_media = extract_includes(response, "media", "media_key")
    includes_polls = extract_includes(response, "polls")
    includes_places = extract_includes(response, "places")
    # Tweets in includes will themselves be expanded
    includes_tweets = extract_includes(response, "tweets")

    def expand_payload(payload):
        """
        Recursively step through an object and sub objects and append extra data.
        Can be applied to any tweet, list of tweets, sub object of tweet etc.
        """

        # Expand list items individually:
        if isinstance(payload, list):
            return [expand_payload(item) for item in payload]

        # Nothing to expand on strings, numbers, booleans or nulls:
        if not isinstance(payload, dict):
            return payload

        # Expand nested values first. Only existing keys are reassigned, so
        # the dict does not change size while it is being iterated.
        for key, value in payload.items():
            payload[key] = expand_payload(value)

        if "author_id" in payload:
            payload["author"] = includes_users[payload["author_id"]]

        if "in_reply_to_user_id" in payload:
            payload["in_reply_to_user"] = includes_users[payload["in_reply_to_user_id"]]

        if "media_keys" in payload:
            payload["media"] = [
                includes_media[media_key] for media_key in payload["media_keys"]
            ]

        if "poll_ids" in payload and len(payload["poll_ids"]) > 0:
            poll_id = payload["poll_ids"][-1]  # only ever 1 poll per tweet.
            payload["poll"] = includes_polls[poll_id]

        # "geo" may be null; only a dict can carry a place_id
        geo = payload.get("geo")
        if isinstance(geo, dict) and "place_id" in geo:
            payload["geo"] = {**geo, **includes_places[geo["place_id"]]}

        if "mentions" in payload:
            payload["mentions"] = [
                {**referenced_user, **includes_users[referenced_user["username"]]}
                for referenced_user in payload["mentions"]
            ]

        if "referenced_tweets" in payload:
            payload["referenced_tweets"] = [
                {**referenced_tweet, **includes_tweets[referenced_tweet["id"]]}
                for referenced_tweet in payload["referenced_tweets"]
            ]

        if "pinned_tweet_id" in payload:
            payload["pinned_tweet"] = includes_tweets[payload["pinned_tweet_id"]]

        return payload

    # First, expand the included tweets, before processing actual result tweets.
    # Iterate over a snapshot: expanding may look up (and thereby insert)
    # missing tweet ids in includes_tweets.
    for included_id, included_tweet in list(includes_tweets.items()):
        includes_tweets[included_id] = expand_payload(included_tweet)

    # Now flatten the list of tweets or an individual tweet
    if "data" in response:
        response["data"] = expand_payload(response["data"])

        # Add the __twarc metadata to each tweet if it's a result set
        if "__twarc" in response and isinstance(response["data"], list):
            for tweet in response["data"]:
                tweet["__twarc"] = response["__twarc"]

    return response