#!/usr/bin/env bash

# build-journals.sh - using Slurm, create journal carrels
#
# Each task of the job array reads the list of carrel names in
# ./etc/jobs.txt, selects the one at position SLURM_ARRAY_TASK_ID, and
# then, unless the library already holds an index.zip for that carrel:
#
#   1. copies and unpacks the legacy archive of the carrel,
#   2. prunes tiny and large files from the unpacked cache,
#   3. skips the carrel when too few documents remain, and
#   4. rebuilds, models, and zips the carrel with rdr.
#
# Progress is written to ./tmp/<carrel>.log, relative to the directory
# from which the job was started.

# Eric Lease Morgan
# (c) University of Notre Dame; distributed under a GNU Public License

# April 6, 2024 - first documentation, moving to Slurm, but hacked on about a month ago
# April 8, 2024 - migrated various modeling tasks off
# April 16, 2024 - tweaked for journals

# configure the batch
#SBATCH --job-name=journals
#SBATCH --nodes=1
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=32
#SBATCH --array=0-710
#SBATCH --output=./logs/%A_%a.out
#SBATCH --error=./logs/%A_%a.err

# configure the job; TINY and LARGE are find(1) -size expressions
JOBS='./etc/jobs.txt'
ITEM=${SLURM_ARRAY_TASK_ID:?SLURM_ARRAY_TASK_ID is not set; run this script as a Slurm job array}
LIBRARY='/shared/carrels'
CACHE='./caches'
LEGACY='/shared/journals'
JAVA='/shared/java/bin'
ZIP='index.zip'
TMP='./tmp'
TINY='-2c'
LARGE='+2M'
MINIMUM=5

# initialize; the job list is read before changing directories because
# its path is relative
if [[ ! -r "$JOBS" ]]; then
  echo "Cannot read $JOBS; nothing to do." >&2
  exit 1
fi

# split the list on whitespace, but without the filename expansion that
# an unquoted $( cat ... ) would perform; read returns non-zero upon
# reaching the end of the file, and that is expected here
read -r -d '' -a CARRELS < "$JOBS"
CARREL=${CARRELS[$ITEM]:-}

# an empty name would otherwise lead to ./.log and a copy of ".zip"
if [[ -z "$CARREL" ]]; then
  echo "No carrel at index $ITEM of $JOBS; nothing to do." >&2
  exit 1
fi

# make sane; everything below uses paths relative to the working directory
export PATH="$JAVA:$PATH"
mkdir -p "$TMP" || exit 1
cd "$TMP" || exit 1

LOG="./$CARREL.log"
DOCUMENTS="./$CARREL/$CACHE/$CARREL"

# append the given message, both stdout and stderr, to the carrel's log
log() {
  echo "$@" &>>"$LOG"
}

# output a bit of debugging; the first write truncates any previous log
echo &>"$LOG"
log "The item is $ITEM, and the carrel is $CARREL."
log

# check to see if the given carrel was already migrated
if [[ -f "$LIBRARY/$CARREL/$ZIP" ]]; then
  log "$CARREL skipping; already processed"
  exit
fi

# debug
log "$CARREL processing"

# initialize; a failed copy or unzip is caught by the directory test below
cp "$LEGACY/$CARREL.zip" "./$CARREL.zip"
unzip -o -d "./$CARREL" "./$CARREL.zip" &>>"$LOG"

# skip over bogus archive
if [[ ! -d "$DOCUMENTS" ]]; then
  log "$CARREL seems to be broken. Call Eric."
  exit
fi

# remove tiny files; they shouldn't be here in the first place
find "$DOCUMENTS" -type f -size "$TINY" -exec rm -- {} +

# remove large files; they are likely to consist of multiple volumes
find "$DOCUMENTS" -type f -size "$LARGE" -exec rm -- {} +

# skip over carrels with too few documents; count the non-hidden entries
# with a glob instead of parsing the output of ls
shopt -s nullglob
ENTRIES=( "$DOCUMENTS"/* )
shopt -u nullglob
COUNT=${#ENTRIES[@]}
if [[ $COUNT -lt $MINIMUM ]]; then
  log "$CARREL is too small ($COUNT items); skipping."
  exit
fi

# re-create the carrel
rdr build "$CARREL" "$DOCUMENTS" -s -e &>>"$LOG"

# add some modeling; both tasks run concurrently in the background
rdr cluster "$CARREL" -v &>>"$LOG" &
rdr summarize "$CARREL" &>>"$LOG" &

# hang out 'till everything is done
wait

# zip
rdr zip "$CARREL" &>>"$LOG"

# done
log "Rebuid done"
exit