#!/usr/bin/env python2.5
# (C) 2008 DFKI Language Technology Lab http://www.dfki.de/lt
# Project HyLaP; Author: Torsten Marek
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307
# USA

import os
import sys
from optparse import OptionParser

from pytake.pdf.pdfbox import convert_to_article, BadExtractionError
from pytake.text_utils import load_wordlist


def main():
    """Parse the command line and run ``convert_to_article`` on one input.

    Usage: ``%prog [options] input output``

    Options:
      -b/--bibtex FILE    the bibtex entry for the article
      -w/--wordlist FILE  a wordlist; loaded with ``load_wordlist`` and
                          passed on, otherwise ``None`` is passed

    The conversion is always invoked with ``keep=True`` and
    ``cache_dir="."``.

    Returns the process exit status: 0 on success, 1 if the conversion
    raised ``BadExtractionError``.  If fewer than two positional arguments
    are given, ``OptionParser.error`` prints the usage message and exits
    the process with status 2.
    """
    op = OptionParser("%prog [options] input output")
    op.add_option("-b", "--bibtex", metavar="FILE",
                  help="the bibtex entry for the article")
    op.add_option("-w", "--wordlist", metavar="FILE",
                  help="a wordlist")

    options, args = op.parse_args()

    # Both positional arguments are indexed below; fail with a usage
    # message instead of an IndexError traceback when they are missing.
    # Surplus arguments are still ignored, as they always have been.
    if len(args) < 2:
        op.error("expected two arguments: input and output")

    if options.wordlist:
        words = load_wordlist(options.wordlist)
    else:
        words = None

    try:
        convert_to_article(args[0], args[1], options.bibtex,
                           keep=True, cache_dir=".", wordlist=words)
    except BadExtractionError:
        sys.stderr.write("Error: Could not extract text from '%s'.\n"
                         % (args[0],))
        # Report the failure to the calling shell as well.
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())