ReformatLegalPdf.sh

Introduction

ReformatLegalPdf.sh is a shell script that converts legal PDFs (SCOTUS, CAFC) to HTML. The script is used by Groklaw.net, for example in the article "Groklaw: Final Bilski Briefs Filed - Microsoft, Google, FFII, ABA, etc."

Download

Download the shell script ReformatLegalPdf.sh here.

Requirements

You need to have the following software installed on your machine:

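  1. pdftotext (part of the xpdf package)
  2. GNU awk (gawk)
  3. A Bourne-compatible shell (/bin/sh) with the standard tail, cat and rm utilities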

Code

#!/bin/sh
#
# This codelet attempts to convert a legal pdf document to crude html.
# It attempts to locate footnotes, and to re-route them to the end of
# the document, wrapped in some html to turn them into hyperlinks.
#
# It uses the fact that, apparently, documents generated for the Federal
# Circuit Court in Utah tend to format footnotes with a numeral that
# follows 5-10 spaces at the start of a line.  If this convention is
# violated, this code may not do the right thing.
#
# Incidentally, it should be clear that this can't possibly work with
# scanned documents.  Only pdf generated by converting a word-processed
# (or LaTeX-ed) document will suit the case.
#
# pdftotext (part of the xpdf package) must be installed for this script
# to work.  In addition, the awk invoked here is GNU awk, which should be
# kept in mind by people attempting to port the script away from linux.
#
# The invocation should be something like
#
#  reformat_legal_pdf.sh IBM_PSJ_GAMEOVER.pdf IBM_PSJ_GAMEOVER.html
#
# This will convert IBM_PSJ_GAMEOVER.pdf and put the result in
# IBM_PSJ_GAMEOVER.html.
#
# Carlo Graziani, 1 July 2006.  Anyone may use and modify and redistribute,
# for any purpose, without conditions.

# Validate the command line: exactly two arguments are required, the
# input PDF and the name of the HTML file to produce.  Diagnostics go to
# stderr so they never end up mixed into captured standard output.
[ $# -eq 2 ] || {
  echo "USAGE:" >&2
  echo "reformat_legal_pdf.sh FILE.pdf FILE.html" >&2
  exit 1
}

# The input must be an existing regular file.  "$1" is quoted so that a
# path containing spaces or glob characters is tested as one word instead
# of being split into several arguments to the test.
[ -f "$1" ] || {
  echo "$1: File not found" >&2
  exit 2
}

# Intermediate files, created in the current directory and made unique
# per run by appending the PID of this shell.
footnote_file=footnotes.$$
text_file=text.$$

# First convert the PDF to plain text (with some formatting preserved).
# Both operands are quoted so that an input path containing whitespace is
# handed to pdftotext as a single argument.
pdftotext -layout "$1" "$text_file" || {
  echo "Can't convert $1 to text." >&2
  exit 3
}

# Now we break out the footnotes from the body
# The top line is an unwanted running head
#
# Overview of this section: any stale footnote file and any previous output
# file are removed, a short header string is written to $2, and the text
# (starting at its second line) is piped through awk.  awk appends body
# lines to $2 and collects footnote lines in "buffer", which it appends to
# the file named by ff once the input is exhausted.
#
# NOTE(review): the text below looks damaged by the page it was copied
# from.  The printf strings contain only the letter n and raw line breaks,
# one commented-out printf spills onto an uncommented line, and the awk
# command line ends without a continuation before the quoted program.
# Presumably backslashes and HTML markup were lost -- confirm against the
# original script before running this.
rm -f $footnote_file $2 
# printf "nn

n" > $2
printf "n" > $2
# NOTE(review): "tail +2" is the historical way of saying "start at line
# 2"; current tail implementations expect "tail -n +2" -- confirm on the
# target system.
tail +2 $text_file | 

# In the awklet below:
# footnote_section=0 means we're in main text
# footnote_section=1 means we're in a footnote section
# footnote_section=2 means we're at the page number
# footnote_section=3 means we're at the end-of-page form feed
# footnote_section=4 means we're at the running head of the next page.
#
# A line of 5-10 spaces followed only by digits switches to state 1; a line
# of 20 or more spaces followed only by digits switches to state 2 and
# prints the number.  Lines seen in state 0 are printed; lines seen in
# state 1 are appended to buffer.  empty_line / empty_line_prev record
# whether the current and the previous line were empty.
#
# NOTE(review): the states above mention a form feed, but the pattern as it
# stands is /f/, which matches any line containing the letter f; likewise
# "n" is appended to buffer where a newline seems intended.  Several string
# literals also span multiple lines.  All of this presumably results from
# lost backslashes and markup -- TODO confirm.
awk --posix -v ff=$footnote_file 
            'BEGIN{footnote_section=0 ; buffer="" ; empty_line=0 ; empty_line_prev=0}
             /^ {5,10}[[:digit:]]+$/ {footnote_section=1}
             /^ {20,}[[:digit:]]+$/ {footnote_section=2 ;
                                     printf "

                                                                                                         %s
",
                                            $1}
             /f/ {footnote_section=3}
             !/f/ && footnote_section == 3 {footnote_section=4}
             footnote_section == 0 {if ((match($0,"^   +") != 0 || empty_line_prev == 1) && empty_line == 1)
                                       print "

" $0;
                      else print $0} 
         footnote_section == 1 {buffer = buffer "n" $0} 
         footnote_section == 4 {footnote_section=0} 
         {empty_line_prev=empty_line ; if ( match($0, "^$") ) empty_line=1; else empty_line=0} 
         END{print buffer >> ff} 
         ' >> $2 

# At this stage, the body has been output to $2, whereas the raw footnotes 
# are in $footnote_file. We now dress up the footnotes in html, and 
# append them to $2. 
#
# Lines consisting of 5-10 spaces followed only by digits are treated as
# footnote markers and reformatted with printf; every other line is printed
# unchanged.  "first" distinguishes the first marker from later ones.
#
# NOTE(review): both printf calls receive $1 three times, yet each format
# string as it stands contains a single %d, stray letters n and raw line
# breaks.  Presumably the HTML anchor markup and the backslashes of the
# newline escapes were lost when this page was copied -- confirm against
# the original script.
cat $footnote_file | 
awk --posix 'BEGIN{first=1}
                     /^ {5,10}[[:digit:]]+$/ { if ( first == 1 ) { first = 0 ; 
                                             printf "n

n%dn", $1, $1, $1} else 
               printf "n

n%dn", 
                    $1, $1, $1} 
               !/^ {5,10}[[:digit:]]+$/ {print} 
               ' >> $2 

# Clean up: remove the intermediate text and footnote files created above.
# (The command had been fused into the comment line, so it never ran.)
rm -f -- "$text_file" "$footnote_file"

***************************** 

#!/bin/sh

# Append the usual binary directories to the search path.  PATH is assigned
# with a quoted value and exported separately: some /bin/sh implementations
# field-split an unquoted expansion passed as an argument to export.
PATH="${PATH}:/bin:/usr/bin:/usr/bin:/usr/local/bin"
PATH="${PATH}:/sbin:/usr/sbin:/usr/local/sbin"
export PATH

# Per-run working area; the PID of this shell keeps the directory name
# unique among concurrently running instances.
ADMDIR=/var/local/pdf_convert
WORKDIR="$ADMDIR/work.$$"
MAILFILE="$WORKDIR/mail"
LOGFILE="$WORKDIR/log"
UNPACKDIR="$WORKDIR/unpack"
NOTIFY_FILE="$WORKDIR/notify"

# Space-separated list of addresses the sender is compared against, and
# the address that receives reports.
VETTED_LIST="pj@groklaw.com pj@groklaw.net carlo@oddjob.uchicago.edu"
ADMIN="carlo@oddjob.uchicago.edu"

# Start from a fresh work directory: remove a leftover one, then create
# it.  Stop right away if it cannot be created, since the log file and the
# stored message both live inside it.
[ -d "$WORKDIR" ] && rm -rf "$WORKDIR"
mkdir "$WORKDIR" || {
  echo "Cannot create $WORKDIR" >&2
  exit 1
}

# Log to logfile: from here on stdout and stderr are appended to $LOGFILE.
exec 1>>"$LOGFILE" 2>&1

# Write the mail file: the incoming message is read from standard input.
cat > "$MAILFILE"

# Verify that the sender is vetted for using this service.
# The stored message is run through "formail -rt"; the second
# whitespace-separated field of the first line it prints is taken as the
# sender and compared against every entry of VETTED_LIST.
sender=$(formail -rt < "$MAILFILE" | head -n 1 | awk '{print $2}')
allowed=
for allowed_sender in $VETTED_LIST; do
  [ "$sender" = "$allowed_sender" ] && { allowed=1; break; }
done

# Unknown sender: report the location of the stored message to the
# administrator and stop.
if [ -z "$allowed" ]; then
  echo "Offending message in $MAILFILE" |
    mail -s "Illegal sender to pdf conversion server" "$ADMIN"
  exit 1
fi

# unpack pdfs
# The stored message is fed to uudeview on standard input from inside a
# dedicated directory.  The cd is checked: the code that follows lists and
# renames every file in the current directory, so it must not run anywhere
# else if the directory change fails.
mkdir -p "$UNPACKDIR"
cd "$UNPACKDIR" || {
  echo "Cannot change directory to $UNPACKDIR"
  exit 1
}
uudeview -t - < "$MAILFILE"

# Walk over every file in the current directory.  Files are addressed by
# inode number (first column of "ls -i1") and their names recovered with
# "find . -inum", spaces in a name are replaced by underscores, and file(1)
# is asked for the type of each file.
#
# npdf and uuenview_filelist are initialised here; in the text that remains
# only uuenview_filelist is extended (with the generated .html names).
# NOTE(review): npdf is never incremented in what is visible, although it
# is tested after the loop -- presumably the increment was in the part that
# is missing (see the next note).
#
# NOTE(review): from the line starting "( echo $file_type | grep -q PDF )"
# onwards the text switches mid-statement into the middle of an awk program
# that closely resembles the one of the stand-alone conversion script.  The
# code between the PDF test and that awk program appears to be lost, and
# the awk text shows the same damage (letter n / f where escapes seem
# intended, string literals spanning lines).  Restore from the original
# script before use.
npdf=0 
uuenview_filelist="" 
# Some shell nonsense to deal with the possibility that unpacked 
# filenames may contain spaces. 
inodes=`ls -i1| awk '{print $1}'` 
for i in $inodes ; do 

  unpacked_file="`find . -inum $i`" 
  uf=`echo $unpacked_file | tr " " "_"` 
  [ "$unpacked_file" = "$uf" ] || { 
     mv "$unpacked_file" "$uf" ; unpacked_file="$uf" 
  } 

#Look for PDF attachments to convert 
  file_type=`file -b "$unpacked_file"` 

  ( echo $file_type | grep -q PDF ) &/^ {5,10}[[:digit:]]+$ {footnote_section=1} 
     /^ {20,}[[:digit:]]+$/ {footnote_section=2 ; printf "

                                                                                                          %s
",
                                     $1}
             /f/ {footnote_section=3}
             !/f/ && footnote_section == 3 {footnote_section=4}
             footnote_section == 0 {if ((match($0,"^   +") != 0 || empty_line_prev == 1) && empty_line == 1)
                                       print "

" $0; 
                     else print $0}
           footnote_section == 1 {buffer = buffer "n" $0} 
           footnote_section == 4 {footnote_section=0} 
           {empty_line_prev=empty_line ; if ( match($0, "^$") ) empty_line=1; else empty_line=0} 
           END{print buffer >> ff} 
           ' >> $unpacked_file.html 

# Then we dress up the footnotes in some html. 
  cat $footnote_file | 
  awk --posix 'BEGIN{first=1} 
             /^ {5,10}[[:digit:]]+$/ { if ( first == 1 ) { first = 0 ;
                                       printf "n

n%dn", 
                       $1, $1, $1} else 
                           printf "
n

n%dn", 
                     $1, $1, $1} 
              !/^ {5,10}[[:digit:]]+$/ {print} 
              ' >> $unpacked_file.html 

# Add the generated files to the list of files to be returned. 
   uuenview_filelist="$uuenview_filelist $unpacked_file.html" } 

done 

[ $npdf -ne 0 ] &|| { 

   echo "From: PDF Robot " > $NOTIFY_FILE 
   echo "Reply-To: $ADMIN" >> $NOTIFY_FILE 
   # Append the To: header to the notification being assembled.  The
   # variable is NOTIFY_FILE; the former spelling NOTIFYFILE is never set,
   # so the redirection had no valid target and the header was lost.
   echo "To: $sender, $ADMIN" >> "$NOTIFY_FILE"
   echo "Subject: PDF Conversion Anomaly" >> $NOTIFY_FILE 
   echo "" >> $NOTIFY_FILE 
   echo "No PDF files were found in your message." >> $NOTIFY_FILE 
   sendmail $sender $ADMIN