#!/usr/bin/perl
# jneubert, 2020-11-07

# Ad-hoc script for preparing swib participants data for institution
# reconciliation against Wikidata and for aggregating the results and posting
# them into a SPARQL endpoint.

# If no WDQS-approved endpoint is available, the data can be injected by a
# values clause - see below.

# TODO: Split script into two parts
#   1) read conftool data and transform to Openrefine input
#      (taking into account prior results)
#   2) aggregate results and load into sparql endpoint
# TODO: Do not use CSV with '|' delimiter - it is used in real-world
#   organization names. Better use TSV for input files, too.
# TODO: Replace string 'swib20' with a variable

use strict;
use warnings;
use utf8;

use Data::Dumper;
use File::Slurp;
use HTML::Template;
use IPC::Open2;
use JSON::XS;
use Path::Tiny;
use POSIX;
use Readonly;
use XML::LibXML;

binmode( STDOUT, ":utf8" );
$Data::Dumper::Sortkeys = 1;

Readonly my $INPUT_FILE => '../var/src/participants.xml';
Readonly my $REFINE_DIR => path('../var/org_refine');

# File written for the next openrefine loop (also named in the statistics).
Readonly my $REFINE_INPUT_FILE => '../var/org_refine/swib20_org_input.csv';

# Placeholder items used in the reconciliation results. They are counted
# separately and never written to the endpoint.
Readonly my $QID_NOT_IN_WIKIDATA => 'Q59496158';    # institution not (yet) in Wikidata
Readonly my $QID_NO_INSTITUTION  => 'Q7883029';     # no institution given

#
# read ConfTool data from registered participants
#
my $parser       = XML::LibXML->new();
my $doc          = $parser->parse_file($INPUT_FILE);
my @participants = $doc->getElementsByTagName('participant');

my %data;
my $participants_count = 0;
foreach my $participant (@participants) {
    my $country  = $participant->findvalue('country');
    my $city     = $participant->findvalue('city');
    my $org_name = $participant->findvalue('organisation');

    # create a lookup key including country, to deal with possible
    # ambiguous org names
    my $key = "$country!$org_name";

    $data{$key}{count}++;
    $data{$key}{country}  = $country;
    $data{$key}{city}     = $city;
    $data{$key}{org_name} = $org_name;
    $participants_count++;
}

#
# read identified organizations (results from prior openrefine loops)
#
my ( %organization, %known_key );
my @refine_files = $REFINE_DIR->children(qr/^swib20_org\d([a-z])?\.tsv$/);
foreach my $refine_file (@refine_files) {
    my $lines = $refine_file->slurp_utf8;
    foreach my $line ( split( /\n/, $lines ) ) {
        my ( $key, $qid ) = split( /\t/, $line );
        next unless $qid;

        # A key from an earlier loop may no longer occur in the current
        # participants data. Count it as 0 without autovivifying %data and
        # without "uninitialized value" warnings.
        my $count = exists $data{$key} ? $data{$key}{count} : 0;
        $organization{$qid}{count} += $count;
        $known_key{$key} = $qid;

        # The key is "country!org_name". Limit the split to two fields, so
        # that a '!' within the organization name does not truncate it.
        my ( $country, $name ) = split( /!/, $key, 2 );
        push( @{ $organization{$qid}{names} }, $name );
    }
}
##print Dumper \%organization;

#
# create and execute sparql queries for endpoint update
#

# q&d, use econ_corp endpoint, which is allowed in a WDQS service clause.
# The endpoint can be updated only locally on the remote sparql_server
# (which must be accessible by key authentication). Via Perl's open2(), STDIN
# is used for data transmission.
my $remote_cmd = 'ssh sparql_server curl --silent '
  . '-X POST -H \"Content-type: application/sparql-update\" '
  . '--data-binary @- http://localhost:3030/econ_corp/update';

# NOTE(review): the prefix IRIs were missing from the copy of the script this
# was restored from. wd: is the standard Wikidata entity namespace; please
# confirm the zbwext: namespace IRI against the endpoint's data.
my $delete_query = <<'EOF';
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX zbwext: <http://zbw.eu/namespaces/zbw-extensions/>
delete {
  ?item zbwext:swib20participants ?count .
}
where {
  ?item zbwext:swib20participants ?count .
}
EOF

# execute delete query remotely (and wait for it to finish, so that it cannot
# overtake the insert below)
_run_remote_update( $remote_cmd, $delete_query );

my $insert_query = <<'EOF';
PREFIX wd: <http://www.wikidata.org/entity/>
PREFIX zbwext: <http://zbw.eu/namespaces/zbw-extensions/>
insert {
  ?item zbwext:swib20participants ?count .
}
where {
  values ( ?item ?count ) {
EOF

my $unknown_count    = 0;
my $no_org_count     = 0;
my $identified_count = 0;
foreach my $qid ( sort keys %organization ) {

    # skip not-yet-determined and undefined entities
    if ( $qid eq $QID_NOT_IN_WIKIDATA ) {
        $unknown_count += $organization{$qid}{count};
        next;
    }
    if ( $qid eq $QID_NO_INSTITUTION ) {
        $no_org_count += $organization{$qid}{count};
        next;
    }
    $identified_count++;
    $insert_query .= "    ( wd:$qid $organization{$qid}{count} )\n";

    # print line for a values clause in a static query, e.g.
    # https://www.wikidata.org/wiki/User:Jneubert/SWIB_queries/SWIB_institutions_extended
    ##print "( wd:$qid $organization{$qid}{count} )\n";
}
$insert_query .= <<'EOF';
  }
}
EOF

# execute insert query remotely
_run_remote_update( $remote_cmd, $insert_query );

#
# output unidentified organizations (for next openrefine loop)
#

# column structure
my $refine_input = "key|country|city|name|bt|organisation\n";

my $not_yet_known_count = 0;
foreach my $key ( sort keys %data ) {
    next if $known_key{$key};
    $refine_input .= "$key|$data{$key}{country}|$data{$key}{city}"
      . "|$data{$key}{org_name}||$data{$key}{org_name}\n";
    $not_yet_known_count++;
}
path($REFINE_INPUT_FILE)->spew_utf8($refine_input);

#
# output statistics
#
print "\n";
print $participants_count, " total participants\n";

# The placeholder items (not in Wikidata / no institution) are not counted
# as identified institutions.
print $identified_count, " institutions identified in Wikidata\n";
print $unknown_count,    " institutions not (yet) in Wikidata\n";
print $no_org_count,     " no institution given\n";
print $not_yet_known_count,
  " new institutions not yet known, in $REFINE_INPUT_FILE\n";
print "\n";

# The placeholder item may not occur at all - fall back to an empty list
# instead of dereferencing undef (which is fatal under strict refs).
my @names_not_in_wikidata =
  grep { defined } @{ $organization{$QID_NOT_IN_WIKIDATA}{names} // [] };
print 'Not in Wikidata: ', join( '; ', @names_not_in_wikidata ), "\n";

# Send a SPARQL update to the endpoint by piping it into the remote command.
#
# Parameters:
#   $cmd   - shell command which reads the update from its STDIN
#   $query - text of the SPARQL update
#
# Returns true if the command exited with status 0, false otherwise (a
# warning including the command's output is issued in that case).
#
# The child process is always reaped with waitpid() before returning, so
# consecutive updates are executed strictly one after the other.
sub _run_remote_update {
    my ( $cmd, $query ) = @_;

    # Do not let a prematurely exited child kill this script via SIGPIPE;
    # the failure is reported through the exit status below.
    local $SIG{PIPE} = 'IGNORE';

    my ( $child_out, $child_in );
    my $pid = open2( $child_out, $child_in, $cmd );

    binmode( $child_in, ':encoding(UTF-8)' );
    print {$child_in} $query;
    close $child_in;

    # Read the complete output, so that the child cannot block on a full
    # pipe. It is only shown when the command fails.
    my $response = do { local $/; <$child_out> };
    close $child_out;

    waitpid( $pid, 0 );
    my $status = $?;
    if ( $status != 0 ) {
        warn "SPARQL update failed (exit status ", $status >> 8, "): ",
          ( defined $response ? $response : '' ), "\n";
        return 0;
    }
    return 1;
}