PDFBox searching

August 22, 2019 2-minute read

Sometimes you need to search through bank statements for that one Sports Basement receipt so you can replace some expensive hiking / climbing gear that isn’t tracked by your own systems or theirs… That’s what led me to write this little gem. A later version used a fuzzy search over the document text instead of matching exact string literals.

@Grapes([
  @Grab(group='org.apache.pdfbox', module='pdfbox', version='2.0.16'),
  @Grab(group='org.apache.tika', module='tika-core', version='1.22')
])

import org.apache.pdfbox.pdmodel.PDDocument
import org.apache.pdfbox.text.PDFTextStripper
import org.apache.tika.Tika
import java.util.concurrent.ThreadPoolExecutor
import java.util.concurrent.TimeUnit
import java.util.concurrent.LinkedBlockingQueue
import java.util.concurrent.ThreadFactory
import java.lang.ThreadLocal

import static groovy.io.FileType.FILES

// Method pointer for writing diagnostics to stderr.
def printerr = System.err.&println

def cli = new CliBuilder(header: 'PDF Text Extractor', usage:'./pdfTextExtractor <directoryToScan> <phrase>', width: 100)

// Declare the help flag so the `cliOptions.help` check below refers to a real option
// and the flag is listed in the generated usage text.
cli.h(longOpt: 'help', 'Show this usage message')

def cliOptions = cli.parse(args)

// CliBuilder.parse() reports the problem itself and returns null when the
// command line is malformed; bail out instead of dereferencing null.
if (cliOptions == null) {
  System.exit(1)
}

// An explicit request for help is a successful run.
if (cliOptions.help) {
  cli.usage()
  System.exit(0)
}

// Exactly two positional arguments are required: <directoryToScan> <phrase>.
// Exit non-zero so calling shells/scripts can detect the misuse.
if (cliOptions.arguments().size() != 2) {
  cli.usage()
  System.exit(1)
}

/**
 * Extracts the text of a PDF file and prints every line that contains a
 * given phrase to standard output.
 *
 * Each instance owns its own PDFTextStripper and Tika objects; the script
 * below hands out one instance per thread through a ThreadLocal.
 */
class TextExtractor {
  static String requiredMimeType = 'application/pdf'
  def pdfStripper = new PDFTextStripper()
  def tika = new Tika()

  /** Phrase used by {@link #processFile(File)}; set it before calling that overload. */
  String phrase

  /**
   * Scans {@code file} for the phrase stored in the {@code phrase} property.
   *
   * @param file the file to inspect
   */
  void processFile(File file) {
    processFile(file, phrase)
  }

  /**
   * Prints every line of {@code file} that contains {@code phrase}
   * (case-sensitive substring match). Files whose detected MIME type is not
   * {@code requiredMimeType} are skipped. Failures are reported on stderr and
   * never propagate, so one unreadable file does not abort the whole scan.
   *
   * @param file   the file to inspect
   * @param phrase the literal text to look for
   */
  void processFile(File file, String phrase) {
    try {
      if (tika.detect(file) != requiredMimeType) {
        return
      }

      def document = PDDocument.load(file)
      try {
        pdfStripper.getText(document).eachLine { line ->
          if (line.contains(phrase)) {
            println line
          }
        }
      } finally {
        // Release the document even when text extraction fails.
        document.close()
      }
    } catch (Exception e) {
      // Script-level variables are not visible inside a class, so write to
      // System.err directly; include the cause to make failures diagnosable.
      System.err.println "Error processing file ${file.path}: ${e.message}"
    }
  }
}

/**
 * Worker thread type created by ReusableThreadFactory. It also hosts the
 * ThreadLocal that lazily creates one TextExtractor per thread.
 */
class ReusableThread extends Thread {
  static def extractor = ThreadLocal.withInitial({
    new TextExtractor()
  })

  ReusableThread(Runnable runnable) {
    super(runnable)
  }
}

/** ThreadFactory that makes the pool use ReusableThread instances. */
class ReusableThreadFactory implements ThreadFactory {
  Thread newThread(Runnable runnable) {
    new ReusableThread(runnable)
  }
}

// 5 core / 50 max threads with a bounded queue of 50; CallerRunsPolicy makes the
// submitting thread run the task itself once both the queue and the pool are full.
def executor = new ThreadPoolExecutor(5, 50, 5, TimeUnit.SECONDS, new LinkedBlockingQueue(50), new ReusableThreadFactory(), new ThreadPoolExecutor.CallerRunsPolicy())

// Resolve the positional arguments once, here in script scope, and pass the
// phrase into the extractor explicitly (classes cannot see script variables).
def searchRoot = new File(cliOptions.arguments().first())
def phrase = cliOptions.arguments().last()

try {
  searchRoot.eachFileRecurse(FILES) { file ->

    executor.submit {
      // ThreadLocal lookup works on whichever thread ends up running the task.
      ReusableThread.extractor.get().processFile(file, phrase)
    }
  }
} finally {
  // Always stop accepting work so the pool's threads can finish and the JVM
  // can exit, even if walking the directory tree fails part-way through.
  executor.shutdown()
}

groovy DocumentScanner.groovy ~/Documents/receipts "sports basement" > results.txt 2>/dev/null & tail -100f results.txt