# (C) 2008 DFKI Language Technology Lab http://www.dfki.de/lt
# Project HyLaP; Author: Torsten Marek
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA
"""Helpers for re-joining hyphenated line breaks and loading word lists."""

__all__ = ("is_complete_paragraph", "unhyphenize", "load_wordlist")

try:
    # Keep the historical ``sets.Set`` type wherever the module exists.
    from sets import Set as set
except ImportError:
    # No ``sets`` module available: fall back to the builtin ``set``.
    pass
from codecs import open
import re


def is_complete_paragraph(text):
    """Return True if *text*, ignoring trailing whitespace, ends with "."."""
    return text.rstrip().endswith(".")


# Shortest non-empty run of characters from the start of a string up to the
# first word boundary; used to grab the leading fragment of a line.
first_word = re.compile(r"(.+?)\b", re.UNICODE)


def unhyphenize(lines, wordlist=None):
    """Join *lines* into a single string, undoing end-of-line hyphenation.

    Every line except the last is stripped of surrounding whitespace.  A line
    that does not end in "-" is followed by a single space.  For a line that
    does end in "-", the lowercased last space-separated token (without the
    hyphen) is concatenated with the leading fragment of the next line, as
    matched by ``first_word``; if that candidate is accepted the hyphen is
    dropped, otherwise it is kept.  In both cases no space is inserted.  The
    last line is appended unmodified.

    :param lines: sequence of strings (must support ``len`` and indexing).
    :param wordlist: optional container of known lowercase words; a candidate
        is accepted when it is ``in wordlist``.  When ``None``, every
        candidate is accepted, i.e. every trailing hyphen is removed.
    :returns: the joined text; the empty string when *lines* is empty.
    """
    # An empty input has no "last line"; return an empty text instead of
    # failing with an IndexError.
    if len(lines) == 0:
        return ""

    if wordlist is None:
        exists = lambda word: True
    else:
        exists = lambda word: word in wordlist

    def joined_parts():
        for idx in range(len(lines) - 1):
            current = lines[idx].strip()
            if not current.endswith("-"):
                yield "%s " % (current,)
                continue

            # Leading fragment of the following line (taken unstripped).
            match = first_word.match(lines[idx + 1])
            if match is None:
                suffix = ""
            else:
                suffix = match.group(1)
            # Last space-separated token of this line, lowercased and
            # without its trailing hyphen.
            prefix = current.rsplit(" ", 1)[-1].lower()[:-1]

            if exists(prefix + suffix):
                yield current[:-1]
            else:
                yield current
        yield lines[-1]

    return "".join(joined_parts())


def load_wordlist(filename):
    """Read *filename* and return the set of its stripped, lowercased lines.

    :param filename: path of a text file containing one word per line.
    :returns: a set with one entry per distinct stripped, lowercased line.
    :raises IOError: propagated unchanged if the file cannot be opened.
    """
    # Open before entering ``try`` so that a failed open propagates its own
    # error instead of being masked by an unbound ``inp`` in ``finally``.
    inp = open(filename, "r")
    try:
        words = set()
        for word in inp:
            words.add(word.strip().lower())
        return words
    finally:
        inp.close()