#!/usr/bin/env perl

# xhtml2metadata.pl - given a well-formed and valid XHTML file, output a stream of bibliographic metadata in the form of TSV

# Eric Lease Morgan <emorgan@nd.edu>
# (c) University of Notre Dame; distributed under a GNU Public License

# April 25, 2024 - first documentation, but probably written about a month ago


# configure
use constant PATH => '/html/head/meta';

# require
use strict;
use XML::Parser;
use XML::XPath;

# get input and sanity check; the one and only argument is the name of the
# XHTML file to read. $ARGV[ 0 ] is the scalar element (@ARGV[ 0 ] is a
# one-element slice), and the test is on definedness and length so a file
# literally named "0" is still accepted
my $html = $ARGV[ 0 ];
if ( ! defined $html || $html eq '' ) {

	# report on STDERR and signal the failure to the caller with a non-zero status
	warn "Usage: $0 <html>\n";
	exit 1;
}

# initialize; tricky -- an explicit XML::Parser object is handed to XML::XPath
# so the NoLWP option can be set on it
# NOTE(review): presumably this keeps the parser from using LWP to resolve the
# external DTD named in an XHTML DOCTYPE -- confirm against the XML::Parser docs
my $parser = XML::XPath->new( parser => XML::Parser->new( NoLWP=>1 ), filename=>$html );

# get and process each metadata element; dc.subject values are accumulated
# in a list, and every other name keeps the content of its last occurrence
my $elements = $parser->find( PATH );
my $found    = 'false';
my %metadata = ();
my @subjects = ();
foreach my $element ( $elements->get_nodelist ) {

	# parse
	my $name    = $element->getAttribute( 'name' );
	my $content = $element->getAttribute( 'content' );

	# skip meta elements without a name (charset, http-equiv, etc.); they
	# carry no bibliographic metadata and must not count as found content
	next if ( ! defined $name || $name eq '' );

	# normalize; a missing content attribute is treated as an empty value
	$content = '' if ( ! defined $content );

	# update flag; kept as the strings 'true'/'false' because the output
	# step compares against 'true'
	$found = 'true';

	# update; empty subjects are dropped so the joined list has no dangling separators
	if ( $name eq 'dc.subject' ) { push( @subjects, $content ) if ( $content ne '' ) }
	else { $metadata{ $name } = $content }

}

# if good content was found, then process it; the result is exactly one
# tab-delimited line with eight columns:
#   carrel, identifier, creator, title, date, description, subjects, provenance
if ( $found eq 'true' ) {

	# add the subject values
	$metadata{ 'dc.subject' } = join( '; ', @subjects );

	# map; the order of names here is the order of the output columns, and
	# any absent value becomes an empty string so every row has the same shape
	my @names  = qw( dc.identifier dc.creator dc.title dc.date dc.description dc.subject dc.provenance );
	my @values = map { defined $_ ? $_ : '' } @metadata{ @names };

	# map some more; the carrel is the last slash-delimited segment of the
	# identifier, or empty when there is no identifier to split
	my $carrel = ( split( '/', $values[ 0 ] ) )[ -1 ];
	$carrel = '' if ( ! defined $carrel );

	# sanitize; an embedded tab, carriage return, or newline would break the
	# one-record-per-line TSV layout, so each run of them becomes a single space
	my @fields = ( $carrel, @values );
	foreach my $field ( @fields ) { $field =~ s/[\t\r\n]+/ /g }

	# output and done
	print join( "\t", @fields ) . "\n" ;
}

# done
exit;