#!/bin/bash
  #
  #
  ##############################################################################
  #
  #                                   xsane2tess 1.0
  #
  #                          *** tesseract made simple ***
  #
  #
  ##############################################################################
  # 
  # xsane2tess is a wrapper around TesseractOCR that makes tesseract usable
  # from xsane.
  #
  #
  #
  ERRORLOG="xsane2tess.log"  # name of the log file that receives STDERR
  TEMP_DIR=/tmp/      # directory for temporary files (TIFF & tesseract data)

  # Print the help screen (purpose, options, log file location, dependencies)
  # on STDOUT.
  usage() {
    printf '%s\n' "Usage: $0 [OPTIONS]
  
    xsane2tess converts files to TIF, scans them with TesseractOCR
    and outputs the text in a file.
  
    OPTIONS:
      -i <file1>  define input file (any image-format supported)
      -o <file2>  define output-file (*.txt)
      -l <lang>  define language-data tesseract should use
  
    Progress- & error-messages will be stored in this logfile:
       $TEMP_DIR$ERRORLOG
  
    xsane2tess depends on
      - ImageMagick  http://www.imagemagick.org/
      - TesseractOCR http://code.google.com/p/tesseract-ocr/
  
    Some coding was stolen from 'ocube'
    http://www.geocities.com/thierryguy/ocube.html
  "
  }

  # Called without a (non-empty) first argument: show the help and stop.
  if [[ -z "$1" ]]; then
    usage
    exit
  fi
  
  
  # get options...
  #######################################
  # Parse the command-line options of xsane2tess.
  # Globals:
  #   FILE_PATH (written) - value of -i, the input image (with path)
  #   FILE_OUT  (written) - value of -o, the output text file
  #   TES_LANG  (written) - value of -l, the tesseract language
  # Arguments:
  #   The script's command-line arguments.
  # Outputs:
  #   An error message on STDERR when an option is given without its value.
  # Returns:
  #   0 on success; terminates the script with status 2 when an option
  #   value is missing.
  #######################################
  parse_options() {
    # Local OPTIND so every call starts parsing at the first argument.
    local OPTION OPTARG OPTIND=1

    while getopts ":i:o:l:" OPTION; do
      case "$OPTION" in
        i)  # input filename (with path)
          FILE_PATH="$OPTARG"
          ;;
        o)  # output filename
          FILE_OUT="$OPTARG"
          ;;
        l)  # Language-selection
          TES_LANG="$OPTARG"
          ;;
        :)  # silent-mode getopts: OPTARG holds the option lacking its value
          echo "$0: option -$OPTARG requires an argument" >&2
          exit 2
          ;;
        \?)  # unknown options stay ignored, exactly as before
          :
          ;;
      esac
    done
  }

  parse_options "$@"
  
  # Nothing can be converted without an input image; report this on the
  # caller's STDERR (not yet redirected to the log) and stop.
  if [[ -z "$FILE_PATH" ]]; then
    echo "$0: no input file given (use -i <file>)" >&2
    exit 2
  fi

  # redirect STDOUT to FILE_OUT (appending). Quoted so paths with spaces work.
  # Without -o the recognised text simply stays on the caller's STDOUT.
  if [[ -n "$FILE_OUT" ]]; then
    exec 1>>"$FILE_OUT" || {
      echo "$0: cannot write to output file '$FILE_OUT'" >&2
      exit 1
    }
  fi

  # redirect STDERR to ERRORLOG (best effort: on failure STDERR stays as is)
  exec 2>>"$TEMP_DIR$ERRORLOG"

  # Work in a private, unpredictably named directory below TEMP_DIR instead of
  # fixed names derived from the input file: concurrent runs cannot collide
  # and an input that already lives in TEMP_DIR is never overwritten/deleted.
  TEMP_ROOT="${TEMP_DIR:-/tmp/}"
  WORK_DIR=$(mktemp -d "${TEMP_ROOT%/}/xsane2tess.XXXXXX") || {
    echo "xsane2tess: cannot create temporary directory in '$TEMP_ROOT'" >&2
    exit 1
  }

  # delete graphic file and tesseract output on every exit path
  trap 'rm -rf -- "$WORK_DIR"' EXIT

  TIF_FILE="$WORK_DIR/input.tif"
  TXT_FILE="$WORK_DIR/output"

  # converting image into TIFF (ImageMagick); its STDOUT goes to the log
  if ! convert "$FILE_PATH" -compress none "$TIF_FILE" 1>&2; then
    echo "xsane2tess: converting '$FILE_PATH' to TIFF failed" >&2
    exit 1
  fi

  # Pass -l only when a language was selected, so that an empty value is
  # never handed to tesseract (it then picks its own default).
  LANG_ARGS=()
  if [[ -n "$TES_LANG" ]]; then
    LANG_ARGS=(-l "$TES_LANG")
  fi

  # start OCR (tesseract expands output with *.txt); its STDOUT goes to the log
  if ! tesseract "$TIF_FILE" "$TXT_FILE" "${LANG_ARGS[@]}" 1>&2; then
    echo "xsane2tess: tesseract failed on '$FILE_PATH'" >&2
    exit 1
  fi

  # STDOUT scanned text => FILE_OUT
  cat "$TXT_FILE.txt"