#!/usr/bin/env perl
# xhtml2metadata.pl - given a well-formed and valid XHTML file, output a stream of bibliographic metadata in the form of TSV
# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License
# April 25, 2024 - first documentation, but probably written about a month ago
# configure
use constant PATH => '/html/head/meta';
# require
use strict;
use XML::Parser;
use XML::XPath;
# main: read the XHTML file named on the command line, collect the name/content
# pairs of its /html/head/meta elements, and print a single tab-delimited row:
#
#   carrel, identifier, creator, title, date, description, subjects, provenance
#
# Nothing is printed when the document has no meta elements at all.

# enable warnings for the rest of the file; the pragma is lexically scoped, so
# it takes effect from here onward
use warnings;

# get input and sanity check; $ARGV[ 0 ] is the scalar element (the slice form
# @ARGV[ 0 ] only worked by accident), and a file literally named "0" is allowed
my $html = $ARGV[ 0 ];
if ( ! defined $html || $html eq '' ) {
    warn "Usage: $0 <xhtml>\n";
    exit;
}

# the parser hands back character strings (see the XML::Parser documentation),
# so give STDOUT an explicit encoding; without one, rows containing only
# Latin-1-range characters are written as Latin-1 bytes while other rows are
# written as UTF-8
binmode( STDOUT, ':encoding(UTF-8)' );

# initialize; tricky -- the XML::Parser object is built by hand so NoLWP can be
# set; per the XML::Parser documentation that option forces the file-based
# external entity handler instead of the LWP-based one
my $parser = XML::XPath->new( parser => XML::Parser->new( NoLWP=>1 ), filename=>$html );

# get and process each metadata element
my $elements = $parser->find( PATH );
my $found    = 0;
my %metadata = ();
my @subjects = ();
foreach my $element ( $elements->get_nodelist ) {

    # update flag; any meta element counts, exactly as before
    $found = 1;

    # parse
    my $name    = $element->getAttribute( 'name' );
    my $content = $element->getAttribute( 'content' );

    # meta elements without a name (charset, http-equiv, etc.) carry nothing
    # this script outputs, so skip them instead of filing them under ''
    next if ( ! defined $name || $name eq '' );
    $content = '' if ( ! defined $content );

    # update; dc.subject is repeatable and accumulated, whereas for every
    # other name the last value seen wins
    if ( $name eq 'dc.subject' ) { push( @subjects, $content ) }
    else { $metadata{ $name } = $content }

}

# if good content was found, then process it
if ( $found ) {

    # add the subject values
    $metadata{ 'dc.subject' } = join( '; ', @subjects );

    # map; absent fields become empty strings so the row always has the same
    # number of columns
    my @names  = qw( dc.identifier dc.creator dc.title dc.date dc.description dc.subject dc.provenance );
    my @values = map { defined $metadata{ $_ } ? $metadata{ $_ } : '' } @names;

    # map some more; the carrel is the last non-empty path segment of the
    # identifier, or empty when there is no identifier
    my $carrel = ( split( '/', $values[ 0 ] ) )[ -1 ];
    $carrel = '' if ( ! defined $carrel );

    # a tab or line break inside a value would corrupt the TSV row, so
    # collapse each run of them to a single space
    my @row = ( $carrel, @values );
    s/[\t\r\n]+/ /g foreach ( @row );

    # output and done
    print join( "\t", @row ) . "\n";

}

# done
exit;