#!/usr/bin/env perl

# xhtml2metadata.pl - given a well-formed and valid XHTML file, output a stream of bibliographic metadata in the form of TSV
#
# usage: xhtml2metadata.pl <file>
#
# Every /html/head/meta element in the given file is read as a name/content
# pair. If at least one such element exists, a single tab-delimited line is
# written to STDOUT with these columns, in this order:
#
#   carrel, identifier, creator, title, date, description, subjects, provenance
#
# where carrel is the last slash-delimited segment of dc.identifier, and
# subjects is every dc.subject value joined with "; ". Fields absent from the
# file are output as empty strings. If the file has no meta elements, nothing
# is output.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# April 25, 2024 - first documentation, but probably written about a month ago


# require
use strict;
use warnings;
use XML::Parser;
use XML::XPath;

# configure
use constant PATH      => '/html/head/meta';
use constant DELIMITER => '; ';

# the Dublin Core fields to output, in column order (carrel is prepended later)
my @fields = qw( dc.identifier dc.creator dc.title dc.date dc.description dc.subject dc.provenance );

# get input and sanity check; test for definedness and length (not truth) so a file named "0" is accepted
my $html = $ARGV[ 0 ];
if ( ! defined $html or ! length $html ) {

	warn "Usage: $0 <file>\n";
	exit 1;

}

# initialize; tricky -- a parser is supplied explicitly so NoLWP can be set
my $parser = XML::XPath->new( parser => XML::Parser->new( NoLWP => 1 ), filename => $html );

# get and process each metadata element
my $elements = $parser->find( PATH );
my $found    = 0;
my %metadata = ();
my @subjects = ();
foreach my $element ( $elements->get_nodelist ) {

	# update flag; any meta element at all counts as found
	$found = 1;

	# parse
	my $name    = $element->getAttribute( 'name' );
	my $content = $element->getAttribute( 'content' );

	# a meta element without a name attribute cannot be mapped to a field
	next if ( ! defined $name or ! length $name );

	# a missing content attribute is treated as an empty value
	$content = '' if ( ! defined $content );

	# update; subjects are repeatable and accumulated, while for everything else the last value wins
	if ( $name eq 'dc.subject' ) { push( @subjects, $content ) }
	else                         { $metadata{ $name } = $content }

}

# if good content was found, then process it
if ( $found ) {

	# add the subject values
	$metadata{ 'dc.subject' } = join( DELIMITER, @subjects );

	# map some more; the carrel is the last segment of the identifier, or empty when there is no identifier
	my $identifier = defined $metadata{ 'dc.identifier' } ? $metadata{ 'dc.identifier' } : '';
	my @segments   = split( '/', $identifier );
	my $carrel     = @segments ? $segments[ -1 ] : '';

	# map; fill in missing fields and collapse embedded tabs and line breaks so each value stays in its own column
	my @row = map {
		my $value = defined $_ ? $_ : '';
		$value =~ s/[\t\r\n]+/ /g;
		$value;
	} ( $carrel, @metadata{ @fields } );

	# output and done
	print join( "\t", @row ) . "\n";

}

# done
exit;