# (C) 2008 DFKI Language Technology Lab http://www.dfki.de/lt
# Project HyLaP; Author: Torsten Marek
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA

from itertools import chain
import subprocess
from cStringIO import StringIO

from bibtex2xml import bibtexwasher, bibtexdecoder
from etree_utils import E, ET, write_tree


# Namespace used for every element of the synthesized bibliography header.
_BIBTEXML_NS = "http://bibtexml.sf.net/"

# External converter that turns LaTeX escapes into UTF-8 text.
_RECODE_COMMAND = ["/usr/bin/recode", "Latex..UTF-8"]


def to_author_name(a):
    """Turn ``"First [Middle] Last"`` into ``"Last, First [Middle]"``.

    The name is split at its last space.  A name without any space is
    returned unchanged.
    """
    parts = a.rsplit(" ", 1)
    if len(parts) != 2:
        return a
    first, last = parts
    return "%s, %s" % (last, first)


def _get_complete_text(article_file):
    """Yield ``(text, page)`` tuples while parsing ``article_file``.

    Non-empty ``abstract`` elements are reported with page ``0``;
    non-empty ``box`` elements are reported with the integer value of
    their ``page`` attribute.
    """
    for _event, element in ET.iterparse(article_file):
        if not element.text:
            continue
        if element.tag == "abstract":
            yield (element.text, 0)
        elif element.tag == "box":
            yield (element.text, int(element.get("page")))


def get_complete_text(article_file):
    """Return all ``(text, page)`` tuples of ``article_file`` as a list."""
    return list(_get_complete_text(article_file))


def get_abstract_text(article_file):
    """Return ``(text, 0)`` for the first ``abstract`` element parsed.

    Unlike ``get_complete_text``, an empty abstract is not skipped, so
    ``text`` may be ``None``.  Returns ``None`` when the document has no
    ``abstract`` element at all.
    """
    for _event, element in ET.iterparse(article_file):
        if element.tag == "abstract":
            return (element.text, 0)
    return None


def from_bibtex(filepath):
    """Convert the BibTeX file at ``filepath`` to a unicode string.

    The file is passed through ``bibtexwasher`` and ``bibtexdecoder``
    (presumably producing BibTeXML markup -- see ``bibtex2xml``), and the
    result is piped through ``recode Latex..UTF-8`` to resolve LaTeX
    escapes.

    Returns the recoded text, decoded from UTF-8 and prefixed with a
    newline.

    Raises ``IOError`` if the file cannot be opened or read.  A message
    is still printed when opening fails, but the error is now propagated
    instead of being replaced by a ``NameError`` further down.
    """
    try:
        fd = open(filepath, 'r')
    except IOError:
        print('Could not open file: %s' % filepath)
        raise
    try:
        filecontents_source = fd.readlines()
    finally:
        # Close the handle even when reading fails.
        fd.close()

    washeddata = bibtexwasher(filecontents_source)
    outdata = bibtexdecoder(washeddata)

    recoder = subprocess.Popen(_RECODE_COMMAND,
                               shell=False,
                               stdin=subprocess.PIPE,
                               stdout=subprocess.PIPE)
    # communicate() feeds stdin and drains stdout concurrently; writing
    # everything first and reading afterwards can deadlock once the
    # child's output pipe is full.  It also waits for the child to exit.
    recoded, _unused = recoder.communicate("".join(outdata))
    return u"\n" + recoded.decode("UTF-8")


def _bibtexml(tag, *children):
    """Build an element named ``tag`` in the BibTeXML namespace."""
    return E(ET.QName(_BIBTEXML_NS, tag), *children)


class Article(object):
    """In-memory representation of an article that serializes to XML.

    The text containers (``main_text``, ``abstract``, ``conclusion``,
    ``references``) are lists of sequences whose first item is the text;
    entries of ``main_text`` also carry a page number as second item.
    """

    def __init__(self, bibtex_file=None):
        """Create an empty article.

        If ``bibtex_file`` is given it is converted with ``from_bibtex``
        and stored UTF-8 encoded; it then replaces the header that would
        otherwise be synthesized from ``title`` and ``authors``.
        """
        if bibtex_file is not None:
            self._bibtex = from_bibtex(bibtex_file).encode("UTF-8")
        else:
            self._bibtex = None

        self.main_text = []
        self.title = u""
        self.authors = []
        self.abstract = []
        self.conclusion = []
        self.references = []

        # Section keyword -> (list collecting that section, flag).
        # Note that ``conclusion`` has no entry here.
        self._targets = {
            "abstract": (self.abstract, True),
            "references": (self.references, True),
        }

    def _get_header(self):
        """Yield the ``header`` element: bibliography data plus abstract."""
        abstract = E.abstract(*[a[0] for a in self.abstract])
        if self._bibtex is not None:
            yield E.header(ET.fromstring(self._bibtex), abstract)
        else:
            authors = [_bibtexml("author", to_author_name(a.strip()))
                       for a in self.authors]
            entry = _bibtexml(
                "entry",
                _bibtexml("inproceedings",
                          _bibtexml("title", self.title),
                          *authors))
            yield E.header(_bibtexml("file", entry), abstract)

    def _get_article_body(self):
        """Yield the ``body`` element.

        Each main text entry becomes one ``box`` with its page number;
        the conclusion texts are joined with newlines into one element.
        """
        boxes = [E.box(t[0], page=str(t[1])) for t in self.main_text]
        conclusion = "\n".join(c[0] for c in self.conclusion)
        yield E.body(E.main(*boxes), E.conclusion(conclusion))

    def get_target(self, text):
        """Pick the container that ``text`` (a heading) refers to.

        Returns ``(True, container, flag)`` for the first registered
        section keyword contained in ``text``; otherwise
        ``(False, self.main_text, False)``.
        """
        for header_content in self._targets:
            if header_content in text:
                return (True, ) + self._targets[header_content]
        return (False, self.main_text, False)

    def to_xml(self):
        """Return the ``article`` element holding header and body."""
        return E.article(
            *chain(self._get_header(), self._get_article_body())
        )