# (GitHub web-view residue captured with this file; not part of the script.)
# Permalink
# Cannot retrieve contributors at this time
# executable file
# 116 lines (98 sloc)
# 3.96 KB
#!/usr/bin/env python
from __future__ import print_function

import fileinput
import json
import re
import sys
from collections import Counter
# Maximum number of distinct words rendered in the cloud.
_MAX_WORDS = 100

# Font sizes are mapped onto [_MIN_FONT_PX, _MIN_FONT_PX + _FONT_RANGE_PX).
_MIN_FONT_PX = 15
_FONT_RANGE_PX = 100.0

_D3_CLOUD_URL = 'https://raw.githubusercontent.com/jasondavies/d3-cloud/master/build/d3.layout.cloud.js'

_STOP_WORDS = frozenset([
    "a", "able", "about", "across", "actually", "after", "against", "agreed",
    "all", "almost", "already", "also", "am", "among", "an", "and", "any",
    "anyone", "anyway", "are", "as", "at", "be", "because", "been", "being",
    "between", "but", "by", "can", "cannot", "come", "could", "dear", "did",
    "do", "does", "either", "else", "ever", "every", "for", "from", "get",
    "getting", "got", "had", "has", "have", "he", "her", "here", "hers",
    "hey", "hi", "him", "his", "how", "however", "i", "i'd", "i'll", "i'm",
    "if", "in", "into", "is", "isnt", "isn't", "it", "its", "just", "kind",
    "last", "latest", "least", "let", "like", "likely", "look", "make",
    "may", "me", "might", "more", "most", "must", "my", "neither", "new",
    "no", "nor", "not", "now", "of", "off", "often", "on", "only", "or",
    "other", "our", "out", "over", "own", "part", "piece", "play", "put",
    "putting", "rather", "real", "really", "said", "say", "says", "she",
    "should", "simply", "since", "so", "some", "than", "thanks", "that",
    "that's", "thats", "the", "their", "them", "then", "there", "these",
    "they", "they're", "this", "those", "tis", "to", "too", "try", "twas",
    "us", "use", "used", "uses", "via", "wants", "was", "way", "we", "well",
    "were", "what", "when", "where", "which", "while", "who", "whom", "why",
    "will", "with", "would", "yet", "you", "your", "you're", "youre",
])

# Punctuation removed from every token.  Apostrophes are handled separately
# so that stop words such as "i'll" can still be recognised first.
_PUNCTUATION = re.compile(r"[.,:()]")
_LEADING_LETTER = re.compile(r"[a-z]", re.IGNORECASE)

# Two %s slots: the d3-cloud library source, then the JSON word list.
_HTML_TEMPLATE = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>twarc wordcloud</title>
<script src="https://d3js.org/d3.v3.min.js"></script>
</head>
<body>
<script>
// embed Jason Davies' d3-cloud since it's not available in a CDN
%s
var fill = d3.scale.category20();
var words = %s
d3.layout.cloud().size([800, 800])
.words(words)
.rotate(function() { return ~~(Math.random() * 2) * 90; })
.font("Impact")
.fontSize(function(d) { return d.size; })
.on("end", draw)
.start();
function draw(words) {
d3.select("body").append("svg")
.attr("width", 1000)
.attr("height", 1000)
.append("g")
.attr("transform", "translate(400,400)")
.selectAll("text")
.data(words)
.enter().append("text")
.style("font-size", function(d) { return d.size + "px"; })
.style("font-family", "Impact")
.style("fill", function(d, i) { return fill(i); })
.attr("text-anchor", "middle")
.attr("transform", function(d) {
return "translate(" + [d.x, d.y] + ")rotate(" + d.rotate + ")";
})
.text(function(d) { return d.text; });
}
</script>
</body>
</html>
"""


def _normalize(token):
    """Return the countable form of *token*, or None if it must be skipped."""
    word = _PUNCTUATION.sub("", token.lower())
    # Check stop words while apostrophes are still present ("i'll", "they're").
    if word in _STOP_WORDS:
        return None
    word = word.replace("'", "")
    if not 3 <= len(word) <= 15:
        return None
    # Second check catches tokens that only match once quotes are removed.
    if word in _STOP_WORDS:
        return None
    # Links and anything beginning with "rt" are dropped.
    if word.startswith(("http", "rt")):
        return None
    # Must begin with a letter; this also excludes @mentions and #hashtags.
    if not _LEADING_LETTER.match(word):
        return None
    return word


def _count_words(lines):
    """Tally countable words across an iterable of JSON-encoded tweet lines."""
    counts = Counter()
    for line in lines:
        try:
            tweet = json.loads(line)
        except ValueError:
            # Blank or malformed line: skip it instead of re-counting the
            # previous tweet (or crashing when there is no previous tweet).
            continue
        if not isinstance(tweet, dict):
            continue
        try:
            body = text(tweet)
        except KeyError:
            # Object without any text field; nothing to count.
            continue
        # split() with no argument also breaks on newlines and tabs, which
        # occur inside tweets and would otherwise glue two words together.
        for token in body.split():
            word = _normalize(token)
            if word is not None:
                counts[word] += 1
    return counts


def _scale_words(counts, max_words):
    """Map the *max_words* most frequent words to d3-cloud text/size dicts."""
    top = counts.most_common(max_words)
    if not top:
        return []
    lowest = top[-1][1]
    ratio = _FONT_RANGE_PX / (top[0][1] - lowest + 1)
    # Scale the distance from the smallest count, not the raw count, so the
    # font size stays bounded however large the counts are.
    return [
        {"text": word, "size": int((count - lowest) * ratio) + _MIN_FONT_PX}
        for word, count in top
    ]


def _fetch_d3_cloud():
    """Download the d3-cloud layout library and return its source as text."""
    try:
        from urllib import urlopen  # Python 2
    except ImportError:
        from urllib.request import urlopen  # Python 3
    response = urlopen(_D3_CLOUD_URL)
    try:
        return response.read().decode('utf8')
    finally:
        response.close()


def main():
    """Write an HTML word cloud for the tweets read via ``fileinput``.

    Each input line is parsed as one JSON object.  Lines that are not valid
    JSON, are not JSON objects, or carry no text field are skipped.  The most
    frequent words (at most ``_MAX_WORDS``) are sized by frequency and
    embedded, together with the downloaded d3-cloud library, in a
    self-contained HTML page written to standard output.

    Raises:
        SystemExit: with a message on stderr when the input yields no
            countable words, so no empty page is produced.
    """
    words = _scale_words(_count_words(fileinput.input()), _MAX_WORDS)
    if not words:
        sys.exit("wordcloud: no countable words found in the input")

    # The words come from untrusted tweets and land inside an inline
    # <script>; escape "<" so a token such as "</script>" cannot end it.
    words_json = json.dumps(words, indent=2).replace("<", "\\u003c")

    # NOTE(review): the downloaded library source is embedded verbatim, so
    # the page trusts whatever that URL serves.
    sys.stdout.write(_HTML_TEMPLATE % (_fetch_d3_cloud(), words_json))
def text(t):
    """Return the text stored in tweet mapping *t*.

    The value under ``'full_text'`` is returned when that key is present
    (even if the value is empty); otherwise the value under ``'text'`` is
    returned.

    Raises:
        KeyError: if *t* contains neither ``'full_text'`` nor ``'text'``.
    """
    if 'full_text' in t:
        return t['full_text']
    return t['text']
# Build the word cloud only when run as a script, not when imported.
if __name__ == "__main__":
    main()