#!/usr/bin/env python
from collections import defaultdict
from xml.etree import ElementTree as ET
from datetime import datetime
from contextlib import nested
from tempfile import gettempdir
import unicodedata
from os import path
import os
from SimpleXMLRPCServer import SimpleXMLRPCServer
from xmlrpclib import ServerProxy

# list of mappings courtesy of Jonathan Read
# see e.g. http://wiki.delph-in.net/moin/WeSearch/LexicalFiltering
# constructed on the basis of a confusion matrix between TnT (so not exactly
# PTB/GTB) and the ERG
GTB_TO_ERG_POS_MAP = {
    'NN': 'n', 'NNS': 'n', 'NNP': 'n', 'NNPS': 'n', 'EX': 'n',
    'VB': 'v', 'VBZ': 'v', 'VBP': 'v', 'VBN': 'v', 'VBD': 'v', 'VBG': 'v',
    'MD': 'v',
    'JJ': 'aj', 'JJR': 'aj', 'JJS': 'aj',
    'RB': 'av', 'RBR': 'av', 'RBS': 'av',
    'IN': 'p', 'RP': 'p',
}

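# Illustrative note (mine, not from the original source): GTBTreeBlazer.filter
# below compares these coarse prefixes against the prefix of an ERG
# lexical-type discriminant key, i.e. discrim.key.split('_')[0]. So a token
# tagged 'NNS' is taken to be compatible with a hypothetical key such as
# 'n_-_c_le' (prefix 'n') and incompatible with, say, 'v_np_le' (prefix 'v').
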
class GTBTreeBlazer(object):
    def __init__(self, match_le_types=False, incompatible_gold_action='ignore',
                 transformer=None, ignore_children_matching_parents=True):
        assert incompatible_gold_action in ('ignore', 'fail')
        self.incompatible_gold_action = incompatible_gold_action
        self.match_le_types = match_le_types
        self.tree_transformer = transformer
        # (if a parent and child have the same node label, ignore the child)
        self.ignore_children_matching_parents = ignore_children_matching_parents

    def filter(self, ext_tree, discrim_group):
        if self.tree_transformer:
            self.tree_transformer.transform(ext_tree)
        raw_text = discrim_group.raw_text
        charspan_to_discrims = defaultdict(set)
        discrim_encompassing_charspans = [set() for _ in raw_text]
        charspan_to_le_type_discrims = defaultdict(set)
        le_type_discrim_encompassing_charspans = [set() for _ in raw_text]
        for discrim in discrim_group.discriminants:
            charspan = (discrim.cfrom, discrim.cto)
            charspan_to_discrims[charspan].add(discrim)
            for i in range(discrim.cfrom, discrim.cto):
                discrim_encompassing_charspans[i].add(charspan)
            if self.match_le_types and discrim.type == 'type':
                charspan_to_le_type_discrims[charspan].add(discrim)
                for i in range(discrim.cfrom, discrim.cto):
                    le_type_discrim_encompassing_charspans[i].add(charspan)
        # XXX need to correct for punctuation diffs somewhere in here
        response = DiscriminantFilterResponse(discrim_group)
        for cons_path in ext_tree.all_paths():
            cons = cons_path[0]
            cons_parent = cons_path[1] if len(cons_path) > 1 else None
            if self.ignore_children_matching_parents:
                # if this is set, we ignore NPs that are immediately beneath
                # NPs, which may avoid some spurious mismatches
                if cons_parent and cons.cat == cons_parent.cat:
                    continue
            char_start, char_end = cons.char_start, cons.char_end
            if char_start is None or char_end is None:
                continue  # null constituent
            handled_enc_charspans = set()
            for char_idx in range(char_start, char_end):
                for enc_charspan in discrim_encompassing_charspans[char_idx]:
                    if enc_charspan in handled_enc_charspans:
                        continue  # we've already handled this char span for this cons
                    handled_enc_charspans.add(enc_charspan)
                    enc_start, enc_end = enc_charspan
                    if enc_start < char_start and enc_end < char_end:
                        disc_extra_head = raw_text[enc_start:char_start]
                        disc_missing_tail = raw_text[enc_end:char_end]
                        # if the only chars causing a conflict are punctuation,
                        # ignore them:
                        extra_head_is_punct = all(is_punctuation(c) for c in disc_extra_head)
                        missing_tail_is_punct = all(is_punctuation(c) for c in disc_missing_tail)
                        # (it's not a real conflict if either non-overlap is
                        # entirely punctuation)
                        is_conflict = not extra_head_is_punct and not missing_tail_is_punct
                    elif enc_start > char_start and enc_end > char_end:
                        disc_missing_head = raw_text[char_start:enc_start]
                        disc_extra_tail = raw_text[char_end:enc_end]
                        # if the only chars causing a conflict are punctuation,
                        # ignore them
                        missing_head_is_punct = all(is_punctuation(c) for c in disc_missing_head)
                        extra_tail_is_punct = all(is_punctuation(c) for c in disc_extra_tail)
                        is_conflict = not missing_head_is_punct and not extra_tail_is_punct
                    else:
                        is_conflict = False
                    if is_conflict:
                        for discrim in charspan_to_discrims[enc_charspan]:
                            response.reject(discrim, unicode(cons))
        if self.match_le_types:
            handled_enc_charspans = set()
            for tok in ext_tree.all_tokens():
                for char_idx in range(tok.char_start, tok.char_end):
                    for enc_charspan in le_type_discrim_encompassing_charspans[char_idx]:
                        if enc_charspan in handled_enc_charspans:
                            continue  # we've already handled this char span
                        handled_enc_charspans.add(enc_charspan)
                        enc_start, enc_end = enc_charspan
                        try:
                            mapped_pos_prefix = GTB_TO_ERG_POS_MAP[tok.cat]
                        except KeyError:
                            continue
                        prefixes_to_discrims = defaultdict(set)
                        # (everything in this mapping has discrim.type == 'type')
                        for discrim in charspan_to_le_type_discrims[enc_charspan]:
                            min_start = min(tok.char_start, enc_start)
                            max_start = max(tok.char_start, enc_start)
                            non_overlap_start = raw_text[min_start:max_start]
                            min_end = min(tok.char_end, enc_end)
                            max_end = max(tok.char_end, enc_end)
                            non_overlap_end = raw_text[min_end:max_end]
                            # make sure the only differences between the two
                            # spans are punctuation
                            exact_or_near_match = all(
                                is_punctuation(c)
                                for c in non_overlap_start + non_overlap_end)
                            if exact_or_near_match:
                                prefix = discrim.key.split('_')[0]
                                prefixes_to_discrims[prefix].add(discrim)
                        if (len(prefixes_to_discrims) > 1
                                and mapped_pos_prefix in prefixes_to_discrims):
                            # only reject if we have multiple prefixes here and
                            # at least one that agrees with the mapped POS
                            for prefix, discrims in prefixes_to_discrims.iteritems():
                                if prefix != mapped_pos_prefix:
                                    for discrim in discrims:
                                        response.reject(discrim, unicode(tok))
            # This may not handle some edge cases with the unary branching
            # conversion quite correctly - e.g. gerunds - but hopefully the
            # cases where we have choices and an exactly matching LE type (as
            # opposed to the one that gets converted) will not be too frequent.
        return response


def is_punctuation(unichr):
    return unicodedata.category(unichr).startswith('P')


class RecursiveTransformer(object):
    """Recursively transforms constituents (bottom-up)"""

    def transform(self, tree):
        self._recursive_transform_constituents(tree.root_constituent)

    def _recursive_transform_constituents(self, constit):
        for sub_constit in constit.sub_constituents:
            self._recursive_transform_constituents(sub_constit)
        self._transform_constituent(constit)

    def _transform_constituent(self, constit):
        raise NotImplementedError()


class NounCompoundDeepener(RecursiveTransformer):
    """Transform an NP-internal contiguous sequence of (JJ(R|S)?|NNS?)+ into an NP

    (It should probably be called something like an Nbar, but so should a few
    other structures, and we're not doing that, so we'll at least be
    consistent.)

    These can't be handled by BinarizePhraseTransformer, as we don't
    necessarily want to binarize, but simply (e.g.) split off the determiner,
    since we can't work out the bracketing for, say, 'massive bike sale', so
    we shouldn't make guesses about it. However, transforming with
    BinarizePhraseTransformer afterwards should create sane PSTs.

    We don't care about NPs with a single NN*, as these should be canonically
    decomposable using BinarizePhraseTransformer.
    """

    def _transform_constituent(self, constit):
        phrase_type = constit.cat
        if phrase_type != 'NP':
            return
        sub_units = constit.sub_units
        num_nns = sum(1 for su in sub_units if su.cat in ('NN', 'NNS'))
        if num_nns < 2:
            return
        sub_unit_iter = iter(sub_units)
        seq_idxs = []
        curr_start = None
        curr_end = None
        in_jj_nn_seq = False
        idx = 0
        while True:
            prev_in_jj_nn_seq = in_jj_nn_seq
            try:
                next_unit = sub_unit_iter.next()
            except StopIteration:
                if prev_in_jj_nn_seq:
                    # clean up the last one
                    seq_idxs.append((curr_start, curr_end))
                break
            if next_unit.kind == 'tok' and next_unit.cat in ('NN', 'NNS', 'JJ', 'JJR', 'JJS'):
                if not prev_in_jj_nn_seq:
                    in_jj_nn_seq = True
                    curr_start = idx
                curr_end = idx + 1
            else:
                if prev_in_jj_nn_seq:
                    seq_idxs.append((curr_start, curr_end))
                    curr_start = None
                    curr_end = None
                    in_jj_nn_seq = False
            idx += 1
        # take a copy, which we can manipulate and then give back
        # (splice right-to-left so earlier indexes stay valid)
        sub_units_new = sub_units[:]
        for (start, end) in reversed(seq_idxs):
            sub_units_new[start:end] = [TreeConstituent('NP', sub_units[start:end])]
        constit.replace_sub_units(sub_units_new)

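# Illustration (mine, not from the original source), using the docstring's
# example:
#   (NP (DT the) (JJ massive) (NN bike) (NN sale))
#     -> (NP (DT the) (NP (JJ massive) (NN bike) (NN sale)))
# i.e. the determiner is split off without guessing at the compound's internal
# bracketing.
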
class BinarizePhraseTransformer(RecursiveTransformer):
    """Converts a flat phrase with multiple children into a binary-branching
    nested phrase (where possible), leaving it untouched if there is
    uncertainty about the head phrase.
    """

    # these POSs, as well as the phrasal categories themselves, are
    # considered heads
    phrase_head_mapping = {
        'ADJP': set(['JJ', 'JJR', 'JJS']),
        'ADVP': set(['RB', 'RBR', 'RBS']),
        'NP': set(['NN', 'NNS', 'NNP', 'NNPS']),
        'VP': set(['VB', 'VBP', 'VBZ', 'VBD', 'VBG', 'VBN']),
    }

    def _transform_constituent(self, constit):
        phrase_type = constit.cat
        sub_units = constit.sub_units
        num_sub_units = len(sub_units)
        if num_sub_units <= 2 or phrase_type not in self.phrase_head_mapping:
            return
        poss_head_types = self.phrase_head_mapping[phrase_type] | set([phrase_type])
        poss_head_idxs = [idx for (idx, su) in enumerate(sub_units)
                          if su.cat in poss_head_types]
        if len(poss_head_idxs) != 1:
            # 0 or > 1 possible heads - return, as we can't binarize
            return
        head_idx = poss_head_idxs[0]
        # keep the nearest immediate head sibling, and binarise the rest
        # successively, preserving the phrase type
        # (in English this will usually be right-branching)
        if head_idx == 0:
            branch_right = True
        elif head_idx == num_sub_units - 1:
            branch_right = False
        else:
            # if it's not at an extreme, we can't work out the binarization
            # canonically, so be conservative and do nothing
            return
        if branch_right:
            new_constit = TreeConstituent(phrase_type, sub_units[:2])
            for i in range(2, num_sub_units - 1):
                new_constit = TreeConstituent(phrase_type, [new_constit, sub_units[i]])
            constit.replace_sub_units([new_constit, sub_units[-1]])
        else:
            new_constit = TreeConstituent(phrase_type, sub_units[-2:])
            for i in range(num_sub_units - 3, 0, -1):
                new_constit = TreeConstituent(phrase_type, [sub_units[i], new_constit])
            constit.replace_sub_units([sub_units[0], new_constit])

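# Illustration (mine, not from the original source): with a unique head at the
# left edge, binarization is right-branching, e.g. (hypothetical categories)
#   (VP (VBD gave) (NP ...) (PP ...) (ADVP ...))
#     -> (VP (VP (VP (VBD gave) (NP ...)) (PP ...)) (ADVP ...))
# With the head at the right edge, the mirror-image left-branching structure
# is built; a head anywhere else (or zero/multiple candidate heads) leaves the
# phrase untouched.
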
class RaiseNPPremodTransformer(RecursiveTransformer):
    """Rearrange NPs which have attachments both before and after the head N,
    so that the left attachments attach higher than the right ones (which
    follows ERG convention).
    """

    target_cats = None

    def _transform_constituent(self, constit):
        phrase_type = constit.cat
        if phrase_type != 'NP' or len(constit.sub_constituents) < 2:
            return
        first_subcons = constit.sub_constituents[0]
        if first_subcons.cat == 'NP' and len(first_subcons.sub_units) > 1:
            # rearrange the NP if the first constituent is an NP with a
            # leftmost member which is in self.target_cats
            leftmost = first_subcons.sub_units[0]
            if leftmost.cat in self.target_cats:
                # create a new constituent for the RHS with everything but
                # the first subunit
                new_right = TreeConstituent(
                    'NP', first_subcons.sub_units[1:] + constit.sub_units[1:])
                constit.replace_sub_units([leftmost, new_right])


class RaiseJJsTransformer(RaiseNPPremodTransformer):
    target_cats = set(['JJ', 'JJR', 'JJS', 'ADJP'])


class RaiseDetsTransformer(RaiseNPPremodTransformer):
    target_cats = set(['DT'])

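# Illustration (mine, not from the original source) for RaiseDetsTransformer:
#   (NP (NP (DT the) (NN dog)) (PP on the mat))
#     -> (NP (DT the) (NP (NN dog) (PP on the mat)))
# i.e. the determiner ends up attaching above the postmodifier, per ERG
# convention.
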
class CombinedTransformer(object):
    def __init__(self):
        self.intern_transformers = []

    def transform(self, tree):
        for tf in self.intern_transformers:
            tf.transform(tree)


class ERGMatchGTBTreeTransformer(CombinedTransformer):
    """Applies some transformations to the GTB tree to account for some
    systematic differences with the ERG"""

    def __init__(self, raise_dets=True, raise_jjs=True, binarize_phrases=True):
        """Initialise with the basic options. Set `raise_dets` to raise the
        determiner to the top left of an NP. Set `raise_jjs` to do the same
        thing with adjectives. Set `binarize_phrases` to make sure that nodes
        (e.g. NPs) have at most one sibling."""
        super(ERGMatchGTBTreeTransformer, self).__init__()
        self.binarize_phrases = binarize_phrases
        self.raise_dets = raise_dets
        self.raise_jjs = raise_jjs
        if self.binarize_phrases:
            self.intern_transformers.append(BinarizePhraseTransformer())
        if self.raise_dets:
            self.intern_transformers.append(RaiseDetsTransformer())
        if self.raise_jjs:
            self.intern_transformers.append(RaiseJJsTransformer())


class IncompatibleGoldDiscriminant(Exception):
    pass


class DiscriminantFilterResponse(object):
    def __init__(self, discriminant_group):
        self.discriminants_by_index = dict((disc.index, disc)
                                           for disc in discriminant_group.discriminants)
        self.rejected_indexes = set()
        self.required_indexes = set()
        self.rejection_reasons = defaultdict(list)
        self.requirement_reasons = defaultdict(list)

    def by_index(self):
        responses = {}
        for idx in self.discriminants_by_index:
            if idx in self.rejected_indexes:
                response = '-'
            elif idx in self.required_indexes:
                response = '+'
            else:
                response = '?'
            responses[idx] = response
        return responses

    def sequential(self):
        return [resp for _, resp in sorted(self.by_index().items())]

    def verbose(self, include_reasons=True, include_unknown=False):
        output = []
        for idx in sorted(self.rejected_indexes):
            reasons = (u' for ' + u', '.join(repr(r) for r in self.rejection_reasons[idx])
                       if include_reasons else u'')
            output.append(u"REJ: %r" % self.discriminants_by_index[idx] + reasons)
        for idx in sorted(self.required_indexes):
            reasons = (u' for ' + u', '.join(repr(r) for r in self.requirement_reasons[idx])
                       if include_reasons else u'')
            output.append(u"AFF: %r" % self.discriminants_by_index[idx] + reasons)
        if include_unknown:
            for idx in sorted(self.unknown_indexes):
                output.append(u"UNK: %r" % self.discriminants_by_index[idx])
        return u"\n".join(output)

    def reject(self, discriminant, reason):
        if discriminant.index in self.required_indexes:
            raise ConflictingConditionsException(
                "Discriminant at index %d was required then rejected" % discriminant.index)
        self.rejected_indexes.add(discriminant.index)
        self.rejection_reasons[discriminant.index].append(reason)

    def require(self, discriminant, reason=None):
        if discriminant.index in self.rejected_indexes:
            raise ConflictingConditionsException(
                "Discriminant at index %d was rejected then required" % discriminant.index)
        self.required_indexes.add(discriminant.index)
        self.requirement_reasons[discriminant.index].append(reason)

    @property
    def unknown_indexes(self):
        return (set(self.discriminants_by_index.iterkeys())
                - self.rejected_indexes - self.required_indexes)


class ConflictingConditionsException(Exception):
    """Indicates that conflicting conditions have been specified."""
    pass

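# Example (mine, not from the original source): with three discriminants of
# which only index 1 has been rejected,
#   response.by_index()   -> {0: '?', 1: '-', 2: '?'}
#   response.sequential() -> ['?', '-', '?']
# where '-' means rejected, '+' required, and '?' no decision.
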
class DiscriminantGroup(object):
    def __init__(self, raw_text, discriminant_xml_list):
        self.raw_text = raw_text
        self.discriminants = [Discriminant(idx, dxml, self.raw_text)
                              for (idx, dxml) in enumerate(discriminant_xml_list)]

    def __repr__(self):
        return "DiscriminantGroup(%r, %r)" % (self.raw_text, self.discriminants)


class Discriminant(object):
    def __init__(self, disc_idx, xml, all_text):
        self.index = disc_idx
        root = ET.fromstring(xml)
        self.supplied_id = root.get('id')  # for debugging only at this stage
        self.type = root.get('type')
        self.key = root.get('key')
        self.cfrom = int(root.get('from'))
        self.cto = int(root.get('to'))
        # should probably not use this, except for maybe debugging
        self.representation = root.text
        self.all_text = all_text

    def raw_text(self):
        return self.all_text[self.cfrom:self.cto]

    def __repr__(self):
        return (u'Discriminant(index=%(index)r, id=%(supplied_id)r, type=%(type)r, '
                u'key=%(key)r, from=%(cfrom)r, to=%(cto)r, repr=%(representation)r)'
                % self.__dict__)

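# The constructor above implies discriminant XML along these lines (the
# element name and attribute values are illustrative assumptions; only the
# attributes actually read - id, type, key, from, to - and the text content
# matter):
#   <discriminant id="d12" type="type" key="n_-_c_le" from="0" to="3">
#     the discriminant's display form
#   </discriminant>
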
class GTBTree(object):
    def __init__(self, raw_xml):
        root_elem = ET.fromstring(raw_xml)
        cons_elems = root_elem.findall('./cons')
        assert len(cons_elems) == 1, "Found invalid number of root constituents"
        root_cons_elem = cons_elems[0]
        self.root_constituent = GTBConstit(root_cons_elem, 0)

    def all_constituents(self):
        return self.root_constituent.all_sub_constituents

    def all_paths(self):
        return self.root_constituent.all_sub_paths

    def all_tokens(self):
        return self.root_constituent.tokens

    def __unicode__(self):
        return unicode(self.root_constituent)

    def get_text(self):
        # urgh - this may not work if we *do* have multiple root constituents
        # - maybe we should just disallow them (if they in fact occur)?
        return self.root_constituent.get_text()


class TreeConstituent(object):
    kind = 'cons'

    def __init__(self, cat, sub_units):
        self.cat = cat
        self.replace_sub_units(sub_units)

    @property
    def sub_units(self):
        return self._sub_units

    @property
    def sub_constituents(self):
        return self._sub_constituents

    @property
    def all_sub_constituents(self):
        all_subs = [self]
        for cons in self.sub_constituents:
            all_subs.extend(cons.all_sub_constituents)
        return all_subs

    @property
    def all_sub_paths(self):
        """The list of all (bottom-to-top) paths through this subtree.

        Each element in the returned list is a tuple going from a constituent
        at some level and tracing upwards through the tree, so the second
        element is the parent, the third the grandparent (if it exists), etc;
        [sp[0] for sp in tree.all_sub_paths] is equivalent to
        tree.all_sub_constituents.
        """
        all_subs = [(self,)]
        for cons in self.sub_constituents:
            for sub_path in cons.all_sub_paths:
                all_subs.append(sub_path + (self,))
        return all_subs

    @property
    def tokens(self):
        all_tokens = []
        for cons in self.sub_units:
            try:
                tokens = cons.tokens
            except AttributeError:
                tokens = [cons]  # it is actually a token itself
            all_tokens.extend(tokens)
        return all_tokens

    def replace_sub_units(self, new_sub_units):
        self._sub_constituents = [su for su in new_sub_units if su.kind == 'cons']
        self._sub_units = new_sub_units[:]  # do we need to copy here?

    def __unicode__(self):
        return u'(%s %s)' % (self.cat, u' '.join(unicode(u) for u in self.sub_units))

    def __repr__(self):
        return 'TreeConstituent(%r, %r)' % (self.cat, self.sub_units)

    def get_text(self, start_char_idx=0):
        text_comps = []
        char_idx = start_char_idx
        for sub_cons in self.sub_units:
            new_char_start = sub_cons.char_start if sub_cons.char_start is not None else char_idx
            if sub_cons.kind == 'tok':
                # write in the amount of whitespace, which we can calculate
                # from the char spans
                text_comps.append(u' ' * (new_char_start - char_idx))
                text_comps.append(sub_cons.text)
            else:
                text_comps.append(sub_cons.get_text(char_idx))
            if sub_cons.char_end is not None:  # don't change for null constituents
                char_idx = sub_cons.char_end
        # Do we need to handle the final token suffix here? We're possibly not
        # even picking it up, so it's irrelevant in any case.
        return u''.join(text_comps)

    @property
    def char_start(self):
        return self.tokens[0].char_start if self.tokens else None

    @property
    def char_end(self):
        return self.tokens[-1].char_end if self.tokens else None


class GTBConstit(TreeConstituent):
    def __init__(self, cons_elem, char_start=0):
        self.cat = cons_elem.get('cat')
        sub_char_start = char_start
        new_subs = []
        for sub_elem in cons_elem:
            if sub_elem.tag == 'cons':
                new_sub = GTBConstit(sub_elem, sub_char_start)
            elif sub_elem.tag == 'tok':
                new_sub = GTBToken(sub_elem, sub_char_start)
            else:
                continue  # ignore anything that isn't a constituent or token
            new_subs.append(new_sub)
            sub_char_start = new_sub.char_end + len(sub_elem.tail or '')
        if not new_subs:
            # empty constituent? add a null token for ease of char tracking
            new_subs.append(NullToken(char_start))
        self.replace_sub_units(new_subs)


class GTBToken(object):
    kind = 'tok'

    def __init__(self, tok_elem, char_start=0):
        self.cat = tok_elem.get('cat')
        self.text = tok_elem.text
        self._char_start = char_start

    @property
    def char_start(self):
        return self._char_start

    @property
    def char_end(self):
        return self._char_start + len(self.text)

    def __unicode__(self):
        return u'%s/%s[%d,%d]' % (self.text, self.cat, self.char_start, self.char_end)

    def __repr__(self):
        return 'Token(%r, %r, %d, %d)' % (self.cat, self.text, self.char_start, self.char_end)


class NullToken(object):
    """A zero-width empty token.

    Makes tracking characterization easier.
    """
    kind = 'tok'

    def __init__(self, char_start):
        self._char_start = char_start
        self.cat = None
        self.text = ''

    @property
    def char_start(self):
        return self._char_start

    @property
    def char_end(self):
        return self._char_start

    def __unicode__(self):
        return u''

    def __repr__(self):
        return 'NullToken(%d)' % self.char_start

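# GTBTree implies GENIA-treebank-style XML input along these lines (a sketch,
# not taken from the original source; only the 'cons'/'tok' tags and 'cat'
# attributes read above are assumed):
#   <sentence>
#     <cons cat="NP"><tok cat="DT">The</tok> <tok cat="NN">dog</tok></cons>
#   </sentence>
# Character offsets are recovered from the token text plus any inter-element
# tail whitespace, which is how get_text() can reconstruct the raw string.
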
class UnknownAggressionLevel(Exception):
    pass


class InputDumpDirNotWriteable(Exception):
    pass


class GTBTreeBlazingHandler(object):
    def __init__(self, aggression_levels=(20,), input_dump_dir=None):
        self.blazers = []
        for lvl in aggression_levels:
            if lvl == 50:
                self.blazers.append(GTBTreeBlazer(
                    match_le_types=True,
                    transformer=ERGMatchGTBTreeTransformer(),
                    ignore_children_matching_parents=False))
            elif lvl == 40:
                self.blazers.append(GTBTreeBlazer(
                    match_le_types=False,
                    transformer=ERGMatchGTBTreeTransformer(),
                    ignore_children_matching_parents=False))
            elif lvl == 20:
                self.blazers.append(GTBTreeBlazer(
                    match_le_types=False,
                    transformer=None,
                    ignore_children_matching_parents=True))
            else:
                raise UnknownAggressionLevel("Don't know about aggression level %r" % lvl)
        self.input_dump_dir = input_dump_dir
        if self.input_dump_dir is not None:
            # check up-front that the dump directory is writeable
            try:
                test_fname = path.join(self.input_dump_dir, 'TEST')
                with open(test_fname, 'w'):
                    pass
                os.unlink(test_fname)
            except (IOError, OSError):
                raise InputDumpDirNotWriteable("Can't write to directory '%s'"
                                               % self.input_dump_dir)

    def treeblaze(self, item_id, call_sequence, item_input, discriminants):
        encoding = 'utf-8'
        if call_sequence >= len(self.blazers):
            # fallback, currently to all trees, after we've run out of ways
            # to selectively prune
            returnable = ['?' for d in discriminants]
        else:
            blazer = self.blazers[call_sequence]
            tree = GTBTree(item_input)
            discriminant_group = DiscriminantGroup(tree.get_text(), discriminants)
            response = blazer.filter(tree, discriminant_group)
            returnable = response.sequential()
        assert len(returnable) == len(discriminants)
        if self.input_dump_dir:
            fname_stem = path.join(self.input_dump_dir, str(item_id))
            item_fname = fname_stem + '.input'
            disc_fname = fname_stem + '.discriminants'
            response_fname = fname_stem + '.%02d.response' % call_sequence
            with nested(open(item_fname, 'w'), open(disc_fname, 'w'),
                        open(response_fname, 'w')) as (item_f, disc_f, resp_f):
                item_f.write(item_input.encode(encoding) + '\n')
                for disc in discriminants:
                    disc_f.write(disc.encode(encoding) + '\n')
                resp_f.write(datetime.now().isoformat() + '\n')
                for val in returnable:
                    resp_f.write(val.encode(encoding) + '\n')
        return returnable


def run_xmlrpc_server(instance, host='localhost', port=8000):
    server = SimpleXMLRPCServer((host, port), encoding='utf-8')
    server.register_introspection_functions()
    server.register_instance(instance)
    print "initializing XMLRPC server for %r on %s:%d" % (instance, host, port)
    server.serve_forever()

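# A minimal client sketch (mine, not from the original source; the item id and
# the tree/discriminant XML strings are placeholders):
#   from xmlrpclib import ServerProxy
#   proxy = ServerProxy('http://localhost:8000/')
#   responses = proxy.treeblaze(42, 0, tree_xml, discriminant_xml_list)
#   # responses is a list like ['?', '-', '?'], parallel to the discriminants
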
def file_to_blazing_comps(fname):
    """Throwaway for debugging - we usually do this over XMLRPC."""
    with open(fname) as f:
        lines = [line.decode('utf-8').rstrip('\n') for line in f]
    ext_tree = GTBTree(lines[0])
    raw_text = ext_tree.get_text()
    discriminant_group = DiscriminantGroup(raw_text, lines[1:])
    return (ext_tree, discriminant_group)


def test_direct_from_files(fnames, levels=(50, 40, 20)):
    test_from_files(fnames, GTBTreeBlazingHandler(levels), len(levels))


def test_xmlrpc_from_files(fnames, max_seq=2, host='localhost', port=8000):
    sproxy = ServerProxy('http://%s:%d/' % (host, port))
    test_from_files(fnames, sproxy, max_seq)


def test_from_files(fnames, blazing_handler, max_seq=1):
    for fname in fnames:
        with open(fname) as f:
            lines = [line.decode('utf-8').rstrip('\n') for line in f]
        fname_base = path.basename(fname)
        item_id, _ = path.splitext(fname_base)
        item_id = int(item_id)
        item_input = lines[0]
        discriminants = lines[1:]
        print "item_id=%d, item_input=%r, discriminants=[..%d discriminants..]" % (
            item_id, item_input, len(discriminants))
        for seq in range(max_seq + 1):
            result = blazing_handler.treeblaze(item_id, seq, item_input, discriminants)
            print "seq=%d, rejected %d: %r" % (
                seq, sum(1 for r in result if r == '-'), result)


# ALSO allow this to be run as an executable for easy distribution to third parties
import optparse


def main():
    parser = optparse.OptionParser("Usage: %prog [options]")
    parser.add_option('-H', '--host', action='store', dest='host', default='localhost',
                      help="Host name or address to listen on (default: %default)")
    parser.add_option('-p', '--port', action='store', dest='port', type='int',
                      default=8000,
                      help="Port to listen on (default: %default)")
    parser.add_option('-L', '--agg-level', action='append', type='int', dest='levels',
                      help="Add the supplied level to the list of aggression levels that"
                      " will be attempted (default: [20] if none are supplied). Level 20"
                      " means avoid matching constituents where the parent has the same"
                      " label. Level 40 means rewrite GTB trees using some heuristics,"
                      " and try to match all nodes. Level 50 means the same as level 40,"
                      " but also attempt to reject or match discriminants at the type"
                      " level using some hastily-constructed rules. If multiple values"
                      " are supplied, they will be tried sequentially on successive"
                      " calls to the blazing module. They should usually be in"
                      " descending numeric order. There is always an unrestricted"
                      " fallback.")
    parser.add_option('--no-dump', action='store_false', dest='dump_input', default=True,
                      help="Don't dump input to a user-supplied or auto-selected directory")
    parser.add_option('--dump-dir', action='store', type='string', default=None,
                      dest='dump_dir')
    options, _ = parser.parse_args()
    levels = options.levels if options.levels else [20]
    if tuple(levels) != tuple(sorted(levels, reverse=True)):
        print "WARNING: list of levels %r is not in descending order." % levels
    if options.dump_input and options.dump_dir is None:
        options.dump_dir = path.join(
            gettempdir(),
            'treeblazing-%s' % datetime.now().strftime("%Y%m%d%H%M%S"))
        os.mkdir(options.dump_dir)
        print "Dumping data to '%s'" % options.dump_dir
    run_xmlrpc_server(GTBTreeBlazingHandler(levels, options.dump_dir),
                      options.host, options.port)


if __name__ == "__main__":
    main()
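# Example invocations (illustrative; the script name, paths and levels are
# placeholders). As a server:
#   python treeblazing.py -H 0.0.0.0 -p 8000 -L 50 -L 40 -L 20 --dump-dir /tmp/blaze
# For offline debugging, test_from_files and friends expect files named
# '<item_id>.<ext>' whose first line is the tree XML and whose remaining lines
# are discriminant XML, one per line:
#   test_direct_from_files(['/path/to/42.txt'], levels=(50, 40, 20))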