ReformatLegalPdf.sh
Introduction
ReformatLegalPdf.sh is a shell script which converts legal PDFs (SCOTUS, CAFC) to HTML. This script is used by Groklaw.net, for example in Groklaw: Final Bilski Briefs Filed - Microsoft, Google, FFII, ABA, etc.
Download
Download the shell script ReformatLegalPdf.sh here.
Requirements
You need to have the following software installed on your machine:
- pdftotext (part of the xpdf package)
- GNU awk (gawk)
- a POSIX-compatible shell (/bin/sh)
Code
#!/bin/sh
#
# This codelet attempts to convert a legal pdf document to crude html.
# It attempts to locate footnotes, and to re-route them to the end of
# the document, wrapped in some html to turn them into hyperlinks.
#
# It uses the fact that apparently, documents generated for the Federal
# Circuit Court in Utah tend to format footnotes with a numeral that
# follows 5-10 spaces at the start of a line. If this convention is
# violated, this code may not do the right thing.
#
# Incidentally, it should be clear that this can't possibly work with
# scanned documents. Only pdf generated by converting a word-processed
# (or LaTeX-ed) document will suit the case.
#
# pdftotext (part of the xpdf package) must be installed for this script
# to work. In addition, the awk invoked here is GNU awk, which should be
# kept in mind by people attempting to port the script away from linux.
#
# The invocation should be something like
#
# reformat_legal_pdf.sh IBM_PSJ_GAMEOVER.pdf IBM_PSJ_GAMEOVER.html
#
# This will convert IBM_PSJ_GAMEOVER.pdf and put the result in
# IBM_PSJ_GAMEOVER.html.
#
# Carlo Graziani, 1 July 2006. Anyone may use and modify and redistribute,
# for any purpose, without conditions.
# Require exactly two arguments: the input PDF and the output HTML file.
# Diagnostics go to stderr so they never mix with regular output.
[ $# -eq 2 ] || {
  echo "USAGE:" >&2
  echo "reformat_legal_pdf.sh FILE.pdf FILE.html" >&2
  exit 1
}
# The input must be an existing regular file.  "$1" is quoted so that a path
# containing spaces or glob characters is tested as a single word.
[ -f "$1" ] || {
  echo "$1: File not found" >&2
  exit 2
}
# Scratch files in the current directory, made unique per run by the PID.
footnote_file=footnotes.$$
text_file=text.$$
# First convert the PDF to plain text (with some formatting preserved).
# The file names are quoted so that paths containing spaces survive intact.
pdftotext -layout "$1" "$text_file" || {
  echo "Can't convert $1 to text." >&2
  exit 3
}
# Now we break out the footnotes from the body.
# The body is appended to the output file ($2); raw footnote text is
# collected in $footnote_file for the later footnote pass.
#
# NOTE(review): this listing appears to have lost the backslashes of its
# escape sequences and the HTML tags inside its string literals.  The escapes
# (\n, \f, line continuation) are restored below.  The markup that presumably
# surrounded them cannot be recovered from this listing and must be re-added
# from the original script.
rm -f "$footnote_file" "$2"
# printf "\n\n\n" > $2
printf "\n" > "$2"
# The top line is an unwanted running head, so start at line 2.
# "tail -n +2" is the POSIX spelling; the obsolete "tail +2" form is rejected
# by current GNU coreutils.
tail -n +2 "$text_file" |
# In the awklet below:
# footnote_section=0 means we're in main text
# footnote_section=1 means we're in a footnote section
# footnote_section=2 means we're at the page number
# footnote_section=3 means we're at the end-of-page form feed
# footnote_section=4 means we're at the running head of the next page.
awk --posix -v ff="$footnote_file" '
BEGIN { footnote_section = 0 ; buffer = "" ; empty_line = 0 ; empty_line_prev = 0 }
# A bare number indented by 5-10 spaces opens a footnote section.
/^ {5,10}[[:digit:]]+$/ { footnote_section = 1 }
# A bare number indented by 20 or more spaces is the page number.
/^ {20,}[[:digit:]]+$/ { footnote_section = 2 ; printf "\n%s\n", $1 }
# A form feed ends the page; the next line without one is the running head.
/\f/ { footnote_section = 3 }
!/\f/ && footnote_section == 3 { footnote_section = 4 }
# Main text: a line that follows an empty line, and is either indented or
# preceded by two empty lines, gets an extra break in front of it.
footnote_section == 0 {
  if ((match($0, "^ +") != 0 || empty_line_prev == 1) && empty_line == 1) {
    print "\n" $0
  } else {
    print $0
  }
}
# Footnote text is held back and written out in one piece at the end.
footnote_section == 1 { buffer = buffer "\n" $0 }
# The running head itself is skipped (the main-text rule above has already
# been passed for this line); main text resumes with the next line.
footnote_section == 4 { footnote_section = 0 }
{ empty_line_prev = empty_line ; if (match($0, "^$")) empty_line = 1 ; else empty_line = 0 }
END { print buffer >> ff }
' >> "$2"
# At this stage, the body has been output to $2, whereas the raw footnotes
# are in $footnote_file. We now dress up the footnotes in html, and
# append them to $2.
#
# NOTE(review): the format strings below appear to have lost their HTML tags
# and the backslashes of their \n escapes.  The escapes are restored.  Each
# printf passes the footnote number three times but only one %d survives, so
# the lost markup presumably used the number twice more (anchor name/target);
# restore it from the original script.  The two branches are kept separate
# because their markup presumably differed for the first footnote.
awk --posix '
BEGIN { first = 1 }
/^ {5,10}[[:digit:]]+$/ {
  if (first == 1) {
    first = 0
    printf "\n\n%d\n", $1, $1, $1
  } else {
    printf "\n\n%d\n", $1, $1, $1
  }
}
# Every line that is not a footnote-number line is copied through unchanged.
!/^ {5,10}[[:digit:]]+$/ { print }
' < "$footnote_file" >> "$2"
# Clean up the scratch files.  The command must sit on its own line; fused
# into the comment it never runs and the scratch files are left behind.
rm -f "$text_file" "$footnote_file"
*****************************
#!/bin/sh
# Mail-driven front end: reads one mail message on stdin, keeps a per-run
# work directory under $ADMDIR, and refuses senders not in $VETTED_LIST.
export PATH=${PATH}:/bin:/usr/bin:/usr/bin:/usr/local/bin
export PATH=${PATH}:/sbin:/usr/sbin:/usr/local/sbin
ADMDIR=/var/local/pdf_convert
WORKDIR=$ADMDIR/work.$$
MAILFILE=$WORKDIR/mail
LOGFILE=$WORKDIR/log
UNPACKDIR=$WORKDIR/unpack
NOTIFY_FILE=$WORKDIR/notify
VETTED_LIST="pj@groklaw.com pj@groklaw.net carlo@oddjob.uchicago.edu"
ADMIN="carlo@oddjob.uchicago.edu"
# Start from an empty work directory.  ${WORKDIR:?} aborts instead of running
# "rm -rf" on an empty path should the variable ever end up unset.
[ -d "$WORKDIR" ] && rm -rf -- "${WORKDIR:?}"
# Everything below writes into $WORKDIR, so give up if it cannot be created.
mkdir "$WORKDIR" || exit 1
# Log to logfile
exec 1>>"$LOGFILE" 2>&1
# Write the mail file (the message arrives on stdin)
cat > "$MAILFILE"
# Verify that the sender is vetted for using this service.
# The sender is the second whitespace-separated field of the first line that
# "formail -rt" prints -- presumably the To: header of the generated reply;
# confirm against the formail documentation.
sender=$(formail -rt < "$MAILFILE" | head -n 1 | awk '{print $2}')
allowed=
# $VETTED_LIST is deliberately unquoted: it is a space-separated list.
for allowed_sender in $VETTED_LIST ; do
  if [ "$sender" = "$allowed_sender" ]; then
    allowed=1
    break
  fi
done
# Unknown sender: tell the administrator where the message was kept and stop.
if [ ! "$allowed" ] ; then
  echo "Offending message in $MAILFILE" |
    mail -s "Illegal sender to pdf conversion server" "$ADMIN"
  exit 1
fi
# unpack pdfs
# The saved message is piped to uudeview from inside $UNPACKDIR; presumably
# uudeview decodes the attachments into that directory (confirm against the
# uudeview documentation).
mkdir -p $UNPACKDIR
cd $UNPACKDIR
cat $MAILFILE | uudeview -t -
# npdf is tested against 0 after the loop; uuenview_filelist accumulates the
# names of the generated .html files.
npdf=0
uuenview_filelist=""
# Some shell nonsense to deal with the possibility that unpacked
# filenames may contain spaces.
# Files are enumerated by inode number (first column of "ls -i1") and looked
# up again with "find -inum", so a name containing spaces is never split by
# the for loop.
inodes=`ls -i1| awk '{print $1}'`
for i in $inodes ; do
unpacked_file="`find . -inum $i`"
# Replace spaces in the name with underscores; rename only if that changed it.
uf=`echo $unpacked_file | tr " " "_"`
[ "$unpacked_file" = "$uf" ] || {
mv "$unpacked_file" "$uf" ; unpacked_file="$uf"
}
#Look for PDF attachments to convert
file_type=`file -b "$unpacked_file"`
# NOTE(review): the next line is damaged.  The PDF test is fused with a
# fragment of an awk program, and whatever stood between them is missing from
# this listing (presumably the start of the conversion block and of the awk
# program).  Nothing from here to the closing "}" before "done" can run as
# shown.  Restore it from the original script.
# NOTE(review): nothing visible increments npdf, so the check after the loop
# would always report that no PDF was found; the increment is presumably part
# of the missing text.
( echo $file_type | grep -q PDF ) &/^ {5,10}[[:digit:]]+$ {footnote_section=1}
/^ {20,}[[:digit:]]+$/ {footnote_section=2 ; printf "
%s
",
$1}
/f/ {footnote_section=3}
!/f/ && footnote_section == 3 {footnote_section=4}
footnote_section == 0 {if ((match($0,"^ +") != 0 || empty_line_prev == 1) && empty_line == 1)
print "
" $0;
else print $0}
footnote_section == 1 {buffer = buffer "n" $0}
footnote_section == 4 {footnote_section=0}
{empty_line_prev=empty_line ; if ( match($0, "^$") ) empty_line=1; else empty_line=0}
END{print buffer >> ff}
' >> $unpacked_file.html
# Then we dress up the footnotes in some html.
cat $footnote_file |
awk --posix 'BEGIN{first=1}
/^ {5,10}[[:digit:]]+$/ { if ( first == 1 ) { first = 0 ;
printf "n
n%dn",
$1, $1, $1} else
printf "
n
n%dn",
$1, $1, $1}
!/^ {5,10}[[:digit:]]+$/ {print}
' >> $unpacked_file.html
# Add the generated files to the list of files to be returned.
uuenview_filelist="$uuenview_filelist $unpacked_file.html" }
done
[ $npdf -ne 0 ] &|| {
# Compose the "no PDF found" notice in $NOTIFY_FILE.  A single redirection
# covers the whole group, so every header lands in the same file and a
# misspelled variable name on one line cannot drop a header (the To: line
# used to be sent to the unset $NOTIFYFILE).
{
  echo "From: PDF Robot "
  echo "Reply-To: $ADMIN"
  echo "To: $sender, $ADMIN"
  echo "Subject: PDF Conversion Anomaly"
  echo ""
  echo "No PDF files were found in your message."
} > "$NOTIFY_FILE"
sendmail $sender $ADMIN