# (C) 2008  DFKI Language Technology Lab http://www.dfki.de/lt
#     Project HyLaP;  Author: Torsten Marek
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA

# Names exported by ``from <module> import *``: the three public helpers
# defined in this file.
__all__ = ("is_complete_paragraph", "unhyphenize", "load_wordlist")

from sets import Set as set
from codecs import open
import re

def is_complete_paragraph(text):
    """Tell whether ``text`` ends in a full stop.

    Trailing whitespace is ignored; only the last remaining character is
    examined, and it has to be a period for the result to be true.
    """
    trimmed = text.rstrip()
    return trimmed.endswith(".")


# Shortest non-empty prefix of a line that ends at a word boundary; for a
# line that starts with a word this is exactly that first word.
first_word = re.compile(r"(.+?)\b", re.UNICODE)


def unhyphenize(lines, wordlist = None):
    """Join the lines of a paragraph into one string, undoing hyphenation.

    Every line except the last is stripped of surrounding whitespace.  A
    line that does not end in "-" is followed by a single space.  For a
    line that ends in "-", the fragment before the hyphen is combined with
    the first word of the next line and looked up:

    * if the combined word is accepted, the hyphen is dropped so the two
      halves fuse into one word;
    * otherwise the hyphen is kept (as in a genuine compound).

    In both cases no space is inserted after the line.  The last line is
    appended verbatim, i.e. without being stripped.

    ``lines`` is any iterable of strings.  ``wordlist`` is an optional
    container of lower-case words supporting ``in``; when it is ``None``
    every combined word is accepted.  The lookup is done in lower case.

    Returns the joined string, or "" when ``lines`` is empty.
    """
    # Materialise once so that iterators/generators are accepted too.
    lines = list(lines)
    if not lines:
        return ""

    def exists(word):
        return wordlist is None or word in wordlist

    def check_endings():
        # Walk over adjacent (line, next line) pairs; the final line is
        # handled separately below.
        for raw, following in zip(lines, lines[1:]):
            current = raw.strip()
            if not current.endswith("-"):
                yield "%s " % (current,)
                continue

            # Middle lines are emitted stripped, so look at the stripped
            # form of the next line as well; otherwise leading whitespace
            # would be mistaken for the "first word".
            m = first_word.match(following.strip())
            if m is None:
                sfx = ""
            else:
                sfx = m.group(1)
            # Last whitespace-separated token of the line, minus the hyphen.
            pfx = current.rsplit(None, 1)[-1][:-1]

            # Lower-case both halves: a capitalised continuation must still
            # match a lower-case word list.
            if exists((pfx + sfx).lower()):
                yield current[:-1]
            else:
                yield current
        yield lines[-1]

    return "".join(check_endings())


def load_wordlist(filename):
    """Read a word list file and return its words as a set.

    The file is expected to hold one word per line.  Each line is stripped
    of surrounding whitespace and lower-cased before it is added; lines
    that are empty after stripping are skipped.

    Errors raised while opening or reading the file propagate unchanged
    to the caller.
    """
    # Open outside the try block: if opening fails there is nothing to
    # close, and the original error must not be masked by the cleanup.
    inp = open(filename, "r")
    try:
        words = set()
        for line in inp:
            word = line.strip().lower()
            if word:
                words.add(word)
        return words
    finally:
        inp.close()