#!/usr/bin/env python

"""
Unfortunately the "expanded_url" values supplied by Twitter aren't always
fully expanded: they are only expanded one hop past t.co, and the result can
itself be a shortened URL.

This script will attempt to completely unshorten each URL, add the result as
the "unshortened_url" key on its url entity, and emit the tweet as JSON again
on stdout.

It starts a pool of 10 separate processes (see --pool-size) which talk to a
running instance of unshrtn:

    http://github.com/edsu/unshrtn
"""
import re
import json
import time
import urllib.request, urllib.parse, urllib.error
import logging
import argparse
import fileinput
import multiprocessing

# number of urls to look up in parallel
POOL_SIZE = 10

# defaults, which can be overridden on the command line
unshrtn_url = "http://localhost:3000"
retries = 2
wait = 15

logging.basicConfig(filename="unshorten.log", level=logging.INFO)


def unshorten_url(url):
    if url is None:
        return None

    # TODO: worth providing some way for the user to specify specific
    # hostnames they want to expand, instead of assuming that all hostnames
    # need expanding?

    # urls that already point at twitter.com don't need unshortening
    if re.match(r"^https?://twitter.com/", url):
        return url

    u = "{}/?{}".format(
        unshrtn_url, urllib.parse.urlencode({"url": url.encode("utf8")})
    )

    resp = None
    for retry in range(0, retries):
        try:
            resp = json.loads(urllib.request.urlopen(u).read().decode("utf-8"))
            break
        except Exception as e:
            logging.error(
                "http error: %s when looking up %s. Try %s of %s",
                e,
                url,
                retry + 1,
                retries,
            )
            time.sleep(wait)

    # if every attempt failed there is nothing more we can do
    if resp is None:
        return None

    # prefer the canonical url when unshrtn reports one, otherwise the long url
    for key in ["canonical", "long"]:
        if key in resp:
            return resp[key]

    return None
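

# For reference, a lookup round trip with unshrtn looks roughly like this.
# The "long" and "canonical" keys are the ones checked for above; the exact
# response payload shown here is an assumption inferred from that check, and
# the urls are made up:
#
#   GET http://localhost:3000/?url=https%3A%2F%2Fbit.ly%2Fabc123
#   -> {"long": "https://example.com/full-story",
#       "canonical": "https://example.com/full-story"}
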
def rewrite_line(line):
    try:
        tweet = json.loads(line)
    except Exception as e:
        # garbage in, garbage out
        logging.error(e)
        return line

    # unshorten every url entity in the tweet
    for url_dict in tweet["entities"]["urls"]:
        if "expanded_url" in url_dict:
            url = url_dict["expanded_url"]
        else:
            url = url_dict["url"]
        url_dict["unshortened_url"] = unshorten_url(url)

    # the user profile can carry a shortened url too
    tweet["user"]["unshortened_url"] = unshorten_url(tweet["user"]["url"])

    return json.dumps(tweet)
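

# An illustrative before/after for rewrite_line, with most tweet fields
# trimmed away and made-up values:
#
#   in:  {"entities": {"urls": [{"url": "https://t.co/x"}]},
#         "user": {"url": null}}
#   out: {"entities": {"urls": [{"url": "https://t.co/x",
#                                "unshortened_url": "https://example.com/a"}]},
#         "user": {"url": null, "unshortened_url": null}}
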
def main():
    global unshrtn_url, retries, wait

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--pool-size",
        help="number of urls to look up in parallel",
        default=POOL_SIZE,
        type=int,
    )
    parser.add_argument(
        "--unshrtn", help="url of the unshrtn service", default=unshrtn_url
    )
    parser.add_argument(
        "--retries",
        help="number of times to retry on an error from the unshrtn service",
        default=retries,
        type=int,
    )
    parser.add_argument(
        "--wait",
        help="number of seconds to wait between retries on an error from the unshrtn service",
        default=wait,
        type=int,
    )
    parser.add_argument(
        "files",
        metavar="FILE",
        nargs="*",
        help="files to read, if empty, stdin is used",
    )
    args = parser.parse_args()

    unshrtn_url = args.unshrtn
    retries = args.retries
    wait = args.wait

    # farm the lines out to a pool of worker processes; results may come back
    # in any order, which is fine since each tweet is rewritten independently
    pool = multiprocessing.Pool(args.pool_size)
    for line in pool.imap_unordered(
        rewrite_line,
        fileinput.input(files=args.files if len(args.files) > 0 else ("-",)),
    ):
        if line != "\n":
            print(line)


if __name__ == "__main__":
    main()