#!/usr/bin/env python

# build a reply, quote, retweet network from a file of tweets and write it
# out as a gexf, gml, dot, json or html file. You will need to have networkx
# installed and pydotplus if you want to use dot. The html presentation
# uses d3 to display the network graph in your browser.
#
#   ./network.py tweets.jsonl network.html
#
# or
#
#   ./network.py tweets.jsonl network.dot
#
# or
#
#   ./network.py tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are users
# instead of tweets use the --users flag
#
#   ./network.py --users tweets.jsonl network.gexf
#
# if you would rather have the network oriented around nodes that are hashtags
# instead of tweets or users, use the --hashtags flag
#
# TODO: this is mostly here so someone can improve it :)

import sys
import json
import networkx
import optparse
import itertools
import time

from networkx import nx_pydot
from networkx.readwrite import json_graph

# output file extensions the writer at the bottom of this script understands
SUPPORTED_EXTENSIONS = (".gexf", ".gml", ".dot", ".json", ".html")

# Twitter's raw created_at layout, and the layout stored on graph nodes
TWITTER_DATE_FORMAT = '%a %b %d %H:%M:%S +0000 %Y'
NODE_DATE_FORMAT = '%d/%m/%Y %H:%M:%S'

# NOTE(review): the d3 page originally embedded here was not present in the
# reviewed copy of this file (the template literal was empty, so the "%"
# formatting below raised TypeError). This is a minimal d3 v4 force layout
# stand-in; restore the project's own template if it is available. The single
# %s placeholder receives the JSON produced by to_json(), so the template must
# not contain any other "%" character.
HTML_TEMPLATE = """<!DOCTYPE html>
<meta charset="utf-8">
<title>twarc network</title>
<style>
.links line { stroke: #999; stroke-opacity: 0.6; }
.nodes circle { fill: #d62728; stroke: #fff; stroke-width: 1.5px; }
</style>
<svg width="960" height="600"></svg>
<script src="https://d3js.org/d3.v4.min.js"></script>
<script>
var graph = %s;

var svg = d3.select("svg"),
    width = +svg.attr("width"),
    height = +svg.attr("height");

var simulation = d3.forceSimulation()
    .force("link", d3.forceLink().id(function(d) { return d.id; }))
    .force("charge", d3.forceManyBody())
    .force("center", d3.forceCenter(width / 2, height / 2));

var link = svg.append("g")
    .attr("class", "links")
  .selectAll("line")
  .data(graph.links)
  .enter().append("line");

var node = svg.append("g")
    .attr("class", "nodes")
  .selectAll("circle")
  .data(graph.nodes)
  .enter().append("circle")
    .attr("r", 5);

node.append("title")
    .text(function(d) { return d.screen_name || d.id; });

simulation.nodes(graph.nodes).on("tick", ticked);
simulation.force("link").links(graph.links);

function ticked() {
  link
      .attr("x1", function(d) { return d.source.x; })
      .attr("y1", function(d) { return d.source.y; })
      .attr("x2", function(d) { return d.target.x; })
      .attr("y2", function(d) { return d.target.y; });
  node
      .attr("cx", function(d) { return d.x; })
      .attr("cy", function(d) { return d.y; });
}
</script>
"""

usage = "network.py tweets.jsonl graph.html"
opt_parser = optparse.OptionParser(usage=usage)

opt_parser.add_option(
    "--retweets",
    dest="retweets",
    action="store_true",
    help="include retweets"
)

opt_parser.add_option(
    "--min_subgraph_size",
    dest="min_subgraph_size",
    type="int",
    help="remove any subgraphs with a size smaller than this number"
)

opt_parser.add_option(
    "--max_subgraph_size",
    dest="max_subgraph_size",
    type="int",
    help="remove any subgraphs with a size larger than this number"
)

opt_parser.add_option(
    "--users",
    dest="users",
    action="store_true",
    help="show user relations instead of tweet relations"
)

opt_parser.add_option(
    "--hashtags",
    dest="hashtags",
    action="store_true",
    help="show hashtag relations instead of tweet relations"
)

options, args = opt_parser.parse_args()

if len(args) != 2:
    opt_parser.error("must supply input and output file names")

tweets, output = args

# fail before reading any tweets rather than silently writing nothing at the
# end when the output extension is not one the writer handles
if not output.endswith(SUPPORTED_EXTENSIONS):
    opt_parser.error(
        "output file must end with one of: " + ", ".join(SUPPORTED_EXTENSIONS)
    )

G = networkx.DiGraph()


def add(from_user, from_id, to_user, to_id, type, created_at=None):
    """Add one relation to the module-level graph G.

    With --users or --hashtags the nodes are keyed by from_user / to_user and
    the edge carries a weight that counts how many times the same relation
    has been added. Otherwise the nodes are keyed by from_id / to_id (tweet
    ids) and the edge is unweighted. If neither branch applies (no to_user
    in name mode, no to_id in tweet mode) nothing is added.

    from_user  -- name used for the source node (screen name or "#hashtag")
    from_id    -- id of the source tweet (only used in tweet mode)
    to_user    -- name used for the target node, may be None in tweet mode
    to_id      -- id of the target tweet (only used in tweet mode)
    type       -- relation label stored on the edge (and on the source node
                  in tweet mode); the name shadows the builtin but is kept so
                  the signature stays unchanged
    created_at -- optional timestamp stored on both nodes as start_date
    """
    # storing start_date allows for timestamps in the gephi timeline, where
    # nodes will appear on screen at their start date and stay on forever
    # after
    if (options.users or options.hashtags) and to_user:
        G.add_node(from_user, screen_name=from_user, start_date=created_at)
        G.add_node(to_user, screen_name=to_user, start_date=created_at)

        if G.has_edge(from_user, to_user):
            weight = G[from_user][to_user]['weight'] + 1
        else:
            weight = 1
        G.add_edge(from_user, to_user, type=type, weight=weight)

    elif not options.users and to_id:
        G.add_node(from_id, screen_name=from_user, type=type)
        if to_user:
            G.add_node(to_id, screen_name=to_user)
        else:
            G.add_node(to_id)
        G.add_edge(from_id, to_id, type=type)


def to_json(g):
    """Return graph g as a dict with "nodes" and "links" lists.

    Each node becomes {"id", "type", "screen_name"} and each edge becomes
    {"source", "target", "type"}; attributes missing on the graph come out
    as None.
    """
    nodes = [
        {
            "id": node_id,
            "type": node_attrs.get("type"),
            "screen_name": node_attrs.get("screen_name")
        }
        for node_id, node_attrs in g.nodes(data=True)
    ]
    links = [
        {
            "source": source,
            "target": target,
            "type": attrs.get("type")
        }
        for source, target, attrs in g.edges(data=True)
    ]
    return {"nodes": nodes, "links": links}


# read the tweets, one JSON document per line, and build the graph
with open(tweets) as tweet_file:
    for line in tweet_file:
        try:
            t = json.loads(line)
        except ValueError:
            # blank or otherwise unparseable line: skip it and carry on
            continue

        from_id = t['id_str']
        from_user = t['user']['screen_name']

        # standardize raw created at date to dd/MM/yyyy HH:mm:ss
        created_at_date = time.strftime(
            NODE_DATE_FORMAT,
            time.strptime(t["created_at"], TWITTER_DATE_FORMAT)
        )

        if options.users:
            for u in t['entities'].get('user_mentions', []):
                add(from_user, from_id, u['screen_name'], None, 'reply',
                    created_at_date)

        elif options.hashtags:
            hashtags = t['entities'].get('hashtags', [])
            # every unordered pair of hashtags used together in this tweet
            for source_tag, target_tag in itertools.combinations(hashtags, 2):
                add('#' + source_tag['text'], None,
                    '#' + target_tag['text'], None,
                    'hashtag', created_at_date)

        else:
            if t.get('in_reply_to_status_id_str'):
                to_id = t['in_reply_to_status_id_str']
                to_user = t['in_reply_to_screen_name']
                add(from_user, from_id, to_user, to_id, "reply")

            if t.get('quoted_status'):
                to_id = t['quoted_status']['id_str']
                to_user = t['quoted_status']['user']['screen_name']
                add(from_user, from_id, to_user, to_id, "quote")

            if options.retweets and t.get('retweeted_status'):
                to_id = t['retweeted_status']['id_str']
                to_user = t['retweeted_status']['user']['screen_name']
                add(from_user, from_id, to_user, to_id, "retweet")

# optionally drop whole components that are too small or too large
min_size = options.min_subgraph_size
max_size = options.max_subgraph_size

if min_size or max_size:
    # G is directed, so use weakly connected components (edge direction
    # ignored). Nodes are collected first and removed afterwards so the graph
    # is not modified while its components are being generated.
    unwanted = set()
    for component in networkx.weakly_connected_components(G):
        size = len(component)
        too_small = min_size and size < min_size
        too_large = max_size and size > max_size
        if too_small or too_large:
            unwanted.update(component)
    G.remove_nodes_from(unwanted)

# write the graph in the format selected by the output file's extension
if output.endswith(".gexf"):
    networkx.write_gexf(G, output)

elif output.endswith(".gml"):
    networkx.write_gml(G, output)

elif output.endswith(".dot"):
    nx_pydot.write_dot(G, output)

elif output.endswith(".json"):
    with open(output, "w") as out:
        json.dump(to_json(G), out, indent=2)

elif output.endswith(".html"):
    graph_data = json.dumps(to_json(G), indent=2)
    # the JSON is embedded inside a <script> element; escape "</" so no value
    # can close that element early ("\/" is a valid JSON/JS string escape)
    graph_data = graph_data.replace("</", "<\\/")
    with open(output, "w") as out:
        out.write(HTML_TEMPLATE % graph_data)