twarc/wordcloud.py at main · DocNow/twarc · GitHub Skip to content Sign up Sign up Why GitHub? Features → Mobile → Actions → Codespaces → Packages → Security → Code review → Project management → Integrations → GitHub Sponsors → Customer stories→ Team Enterprise Explore Explore GitHub → Learn and contribute Topics → Collections → Trending → Learning Lab → Open source guides → Connect with others The ReadME Project → Events → Community forum → GitHub Education → GitHub Stars program → Marketplace Pricing Plans → Compare plans → Contact Sales → Education → In this repository All GitHub ↵ Jump to ↵ No suggested jump to results In this repository All GitHub ↵ Jump to ↵ In this organization All GitHub ↵ Jump to ↵ In this repository All GitHub ↵ Jump to ↵ Sign in Sign up Sign up {{ message }} DocNow / twarc Notifications Star 1k Fork 214 Code Issues 53 Pull requests 0 Actions Projects 0 Wiki Security Insights More Code Issues Pull requests Actions Projects Wiki Security Insights Permalink main Switch branches/tags Branches Tags Nothing to show {{ refName }} default View all branches Nothing to show {{ refName }} default View all tags twarc/utils/wordcloud.py / Jump to Code definitions No definitions found in this file. Code navigation not available for this commit Go to file Go to file T Go to line L Go to definition R Copy path Copy permalink Cannot retrieve contributors at this time executable file 116 lines (98 sloc) 3.96 KB Raw Blame Open with Desktop View raw View blame #!/usr/bin/env python from __future__ import print_function import re import sys import json import fileinput def main(): try: from urllib import urlopen # Python 2 except ImportError: from urllib.request import urlopen # Python 3 MAX_WORDS = 100 word_counts = {} stop_words = set(["a","able","about","across","actually","after","against","agreed","all","almost","already","also","am","among","an","and","any","anyone","anyway","are","as","at","be","because","been","being","between","but","by","can","cannot","come","could","dear","did","do","does","either","else","ever","every","for","from","get","getting","got","had","has","have","he","her","here","hers","hey","hi","him","his","how","however","i","i'd","i'll","i'm","if","in","into","is","isnt","isn't","it","its","just","kind","last","latest","least","let","like","likely","look","make","may","me","might","more","most","must","my","neither","new","no","nor","not","now","of","off","often","on","only","or","other","our","out","over","own","part","piece","play","put","putting","rather","real","really","said","say","says","she","should","simply","since","so","some","than","thanks","that","that's","thats","the","their","them","then","there","these","they","they're","this","those","tis","to","too","try","twas","us","use","used","uses","via","wants","was","way","we","well","were","what","when","where","which","while","who","whom","why","will","with","would","yet","you","your","you're","youre"]) for line in fileinput.input(): try: tweet = json.loads(line) except: pass for word in text(tweet).split(' '): word = word.lower() word = word.replace(".", "") word = word.replace(",", "") word = word.replace("...", "") word = word.replace("'", "") word = word.replace(":", "") word = word.replace("(", "") word = word.replace(")", "") if len(word) < 3: continue if len(word) > 15: continue if word in stop_words: continue if word[0] in ["@", "#"]: continue if re.match('https?', word): continue if word.startswith("rt"): continue if not re.match('^[a-z]', word, re.IGNORECASE): continue word_counts[word] = word_counts.get(word, 0) + 1 sorted_words = list(word_counts.keys()) sorted_words.sort(key = lambda x: word_counts[x], reverse=True) top_words = sorted_words[0:MAX_WORDS] words = [] count_range = word_counts[top_words[0]] - word_counts[top_words[-1]] + 1 size_ratio = 100.0 / count_range for word in top_words: size = int(word_counts[word] * size_ratio) + 15 words.append({ "text": word, "size": size }) wordcloud_js = urlopen('https://raw.githubusercontent.com/jasondavies/d3-cloud/master/build/d3.layout.cloud.js').read() output = """