#!/usr/bin/env python # Compatible with Python 2.7 and 3.2+, can be used either as a module # or a standalone executable. # # Copyright 2017, 2018 Institute of Formal and Applied Linguistics (UFAL), # Faculty of Mathematics and Physics, Charles University, Czech Republic. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # # Authors: Milan Straka, Martin Popel # # Changelog: # - [12 Apr 2018] Version 0.9: Initial release. # - [19 Apr 2018] Version 1.0: Fix bug in MLAS (duplicate entries in functional_children). # Add --counts option. # - [02 May 2018] Version 1.1: When removing spaces to match gold and system characters, # consider all Unicode characters of category Zs instead of # just ASCII space. # Command line usage # ------------------ # conll18_ud_eval.py [-v] gold_conllu_file system_conllu_file # # - if no -v is given, only the official CoNLL18 UD Shared Task evaluation metrics # are printed # - if -v is given, more metrics are printed (as precision, recall, F1 score, # and in case the metric is computed on aligned words also accuracy on these): # - Tokens: how well do the gold tokens match system tokens # - Sentences: how well do the gold sentences match system sentences # - Words: how well can the gold words be aligned to system words # - UPOS: using aligned words, how well does UPOS match # - XPOS: using aligned words, how well does XPOS match # - UFeats: using aligned words, how well does universal FEATS match # - AllTags: using aligned words, how well does UPOS+XPOS+FEATS match # - Lemmas: using aligned words, how well does LEMMA match # - UAS: using aligned words, how well does HEAD match # - LAS: using aligned words, how well does HEAD+DEPREL(ignoring subtypes) match # - CLAS: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes) match # - MLAS: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes)+UPOS+UFEATS+FunctionalChildren(DEPREL+UPOS+UFEATS) match # - BLEX: using aligned words with content DEPREL, how well does # HEAD+DEPREL(ignoring subtypes)+LEMMAS match # - if -c is given, raw counts of correct/gold_total/system_total/aligned words are printed # instead of precision/recall/F1/AlignedAccuracy for all metrics. # API usage # --------- # - load_conllu(file) # - loads CoNLL-U file from given file object to an internal representation # - the file object should return str in both Python 2 and Python 3 # - raises UDError exception if the given file cannot be loaded # - evaluate(gold_ud, system_ud) # - evaluate the given gold and system CoNLL-U files (loaded with load_conllu) # - raises UDError if the concatenated tokens of gold and system file do not match # - returns a dictionary with the metrics described above, each metric having # three fields: precision, recall and f1 # Description of token matching # ----------------------------- # In order to match tokens of gold file and system file, we consider the text # resulting from concatenation of gold tokens and text resulting from # concatenation of system tokens. These texts should match -- if they do not, # the evaluation fails. # # If the texts do match, every token is represented as a range in this original # text, and tokens are equal only if their range is the same. # Description of word matching # ---------------------------- # When matching words of gold file and system file, we first match the tokens. # The words which are also tokens are matched as tokens, but words in multi-word # tokens have to be handled differently. # # To handle multi-word tokens, we start by finding "multi-word spans". # Multi-word span is a span in the original text such that # - it contains at least one multi-word token # - all multi-word tokens in the span (considering both gold and system ones) # are completely inside the span (i.e., they do not "stick out") # - the multi-word span is as small as possible # # For every multi-word span, we align the gold and system words completely # inside this span using LCS on their FORMs. The words not intersecting # (even partially) any multi-word span are then aligned as tokens. from __future__ import division from __future__ import print_function import argparse import io import sys import unicodedata import unittest import json from os import path, stat # CoNLL-U column names ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC = range(10) # Content and functional relations CONTENT_DEPRELS = { "nsubj", "obj", "iobj", "csubj", "ccomp", "xcomp", "obl", "vocative", "expl", "dislocated", "advcl", "advmod", "discourse", "nmod", "appos", "nummod", "acl", "amod", "conj", "fixed", "flat", "compound", "list", "parataxis", "orphan", "goeswith", "reparandum", "root", "dep" } FUNCTIONAL_DEPRELS = { "aux", "cop", "mark", "det", "clf", "case", "cc" } UNIVERSAL_FEATURES = { "PronType", "NumType", "Poss", "Reflex", "Foreign", "Abbr", "Gender", "Animacy", "Number", "Case", "Definite", "Degree", "VerbForm", "Mood", "Tense", "Aspect", "Voice", "Evident", "Polarity", "Person", "Polite" } TRIAL_KEYS = {} TRIAL_KEYS['events'] = {'00300193': 'training/8454603.txt', '00300192': 'training/9442395.txt', '00300191': 'training/9843840.txt', '00300190': 'training/9343406.txt', '00300197': 'training/1419905.txt', '00300196': 'training/10229820.txt', '00300195': 'training/2109187.txt', '00300194': 'training/10221643.txt', '003001223': 'training/9243748.txt', '003001388': 'training/7594456.txt', '003001283': 'training/9277478.txt', '003001282': 'training/2304473.txt', '003001281': 'training/2394747.txt', '003001280': 'training/8816436.txt', '003001287': 'training/8662845.txt', '003001286': 'training/9835626.txt', '003001285': 'training/9311830.txt', '003001284': 'training/10225377.txt', '003001320': 'training/10200294.txt', '003001321': 'training/1545132.txt', '003001322': 'training/8871649.txt', '003001323': 'training/8298127.txt', '003001324': 'training/8634413.txt', '003001325': 'training/7682243.txt', '003001326': 'training/10064103.txt', '003001327': 'training/1896645.txt', '00300175': 'training/7609053.txt', '003001100': 'training/8067997.txt', '003001268': 'training/9783909.txt', '003001102': 'training/8947512.txt', '003001103': 'training/1676267.txt', '003001104': 'training/8566023.txt', '003001105': 'training/1653056.txt', '003001106': 'training/9306134.txt', '003001107': 'training/9032344.txt', '003001108': 'training/8419337.txt', '003001109': 'training/9765295.txt', '003001263': 'training/9852211.txt', '003001262': 'training/10318942.txt', '003001265': 'training/7565732.txt', '003001264': 'training/7969146.txt', '003001267': 'training/2017258.txt', '003001266': 'training/2065663.txt', '00300144': 'training/9111040.txt', '00300145': 'training/9015187.txt', '00300146': 'training/10347175.txt', '00300147': 'training/1981844.txt', '00300140': 'training/7594540.txt', '00300141': 'training/1532661.txt', '00300142': 'training/10377075.txt', '00300143': 'training/9211847.txt', '00300148': 'training/7590666.txt', '00300149': 'training/9633826.txt', '00300177': 'training/7823943.txt', '003001197': 'training/9819151.txt', '003001196': 'training/7961690.txt', '003001195': 'training/9060666.txt', '003001194': 'training/9686612.txt', '003001193': 'training/9710582.txt', '003001219': 'training/8543789.txt', '003001191': 'training/10430908.txt', '003001190': 'training/2121746.txt', '003001214': 'training/1372388.txt', '003001215': 'training/7670114.txt', '003001216': 'training/8144878.txt', '003001217': 'training/2006151.txt', '003001210': 'training/9715838.txt', '003001211': 'training/1620119.txt', '003001199': 'training/9309306.txt', '003001198': 'training/7659529.txt', '00300176': 'training/10203577.txt', '00300171': 'training/1448931.txt', '003001359': 'training/1907460.txt', '003001291': 'training/10202027.txt', '0030028': 'development/8617979.txt', '0030029': 'development/10395671.txt', '0030026': 'development/10395652.txt', '0030027': 'development/8622883.txt', '0030024': 'development/8690900.txt', '0030025': 'development/9356353.txt', '0030022': 'development/9032466.txt', '0030023': 'development/8106512.txt', '0030020': 'development/9808189.txt', '0030021': 'development/8096091.txt', '003001298': 'training/10364260.txt', '003001299': 'training/2144551.txt', '003001364': 'training/7929104.txt', '003001365': 'training/8663022.txt', '003001366': 'training/8189531.txt', '003001367': 'training/10359895.txt', '003001360': 'training/8207643.txt', '003001361': 'training/8973354.txt', '003001362': 'training/9862666.txt', '003001363': 'training/9915779.txt', '003001405': 'training/8628274.txt', '003001404': 'training/9464836.txt', '003001407': 'training/7859290.txt', '003001406': 'training/9224203.txt', '003001401': 'training/9095577.txt', '003001400': 'training/2072454.txt', '003001403': 'training/2006423.txt', '003001402': 'training/8605348.txt', '00300249': 'development/9825820.txt', '00300248': 'development/7495759.txt', '00300247': 'development/9166418.txt', '00300246': 'development/8898960.txt', '00300245': 'development/9353251.txt', '00300244': 'development/8621773.txt', '00300243': 'development/9852070.txt', '00300242': 'development/7499267.txt', '00300241': 'development/1719551.txt', '00300240': 'development/10415075.txt', '003001225': 'training/9974401.txt', '003001224': 'training/8725939.txt', '003001227': 'training/9570512.txt', '00300183': 'training/9952372.txt', '003001221': 'training/10430922.txt', '00300185': 'training/8761381.txt', '00300186': 'training/9013959.txt', '00300187': 'training/1851743.txt', '003001382': 'training/9242564.txt', '003001383': 'training/1956769.txt', '003001380': 'training/8663174.txt', '003001381': 'training/10455134.txt', '003001386': 'training/7869038.txt', '003001387': 'training/9119999.txt', '003001384': 'training/8839844.txt', '003001385': 'training/10229841.txt', '003001409': 'training/8018594.txt', '003001408': 'training/7923175.txt', '003001311': 'training/10082473.txt', '003001310': 'training/10329845.txt', '003001313': 'training/8018558.txt', '003001312': 'training/8135784.txt', '003001315': 'training/9442380.txt', '003001314': 'training/7986199.txt', '003001317': 'training/9209438.txt', '003001316': 'training/9730957.txt', '003001319': 'training/9438495.txt', '003001318': 'training/9031085.txt', '003001139': 'training/9697844.txt', '003001138': 'training/8804437.txt', '003001131': 'training/7578980.txt', '003001130': 'training/10329626.txt', '003001133': 'training/7522304.txt', '003001132': 'training/7569976.txt', '003001135': 'training/9187264.txt', '003001134': 'training/2172166.txt', '003001137': 'training/10438457.txt', '003001136': 'training/9032280.txt', '00300119': 'training/8652841.txt', '00300118': 'training/8602529.txt', '00300113': 'training/8441379.txt', '00300112': 'training/7645208.txt', '00300111': 'training/8843413.txt', '00300110': 'training/10349513.txt', '00300117': 'training/9858618.txt', '00300116': 'training/9052839.txt', '00300115': 'training/9874515.txt', '00300114': 'training/8797708.txt', '003001148': 'training/9322967.txt', '003001149': 'training/10327050.txt', '003001144': 'training/10069412.txt', '003001145': 'training/9073544.txt', '003001146': 'training/9888865.txt', '003001147': 'training/10438913.txt', '003001140': 'training/9817603.txt', '003001141': 'training/7720085.txt', '003001142': 'training/8871623.txt', '003001143': 'training/10331989.txt', '00300174': 'training/9733716.txt', '003001276': 'training/9195127.txt', '003001277': 'training/8702849.txt', '00300168': 'training/1896644.txt', '00300236': 'development/9794238.txt', '00300237': 'development/10229275.txt', '00300234': 'development/8925904.txt', '00300235': 'development/7506531.txt', '00300232': 'development/9507734.txt', '00300233': 'development/7747440.txt', '00300230': 'development/9356494.txt', '00300231': 'development/10411007.txt', '00300182': 'training/9011569.txt', '00300238': 'development/10403270.txt', '00300239': 'development/10415609.txt', '003001389': 'training/9028949.txt', '00300184': 'training/9710600.txt', '003001220': 'training/8314792.txt', '0030017': 'training/9634075.txt', '0030016': 'training/7888116.txt', '0030015': 'training/7742037.txt', '0030014': 'training/10022897.txt', '0030013': 'training/9722600.txt', '0030012': 'training/7629508.txt', '0030011': 'training/2023633.txt', '0030010': 'training/9310836.txt', '003001222': 'training/9219058.txt', '0030019': 'training/9427533.txt', '0030018': 'training/7864072.txt', '00300188': 'training/9400372.txt', '00300189': 'training/7543515.txt', '003001368': 'training/8664547.txt', '003001229': 'training/8468462.txt', '003001228': 'training/8052854.txt', '003001369': 'training/8943338.txt', '003001294': 'training/7589085.txt', '003001295': 'training/10378895.txt', '003001296': 'training/9047239.txt', '003001297': 'training/9872676.txt', '003001290': 'training/10364157.txt', '003001358': 'training/9765256.txt', '003001292': 'training/7623828.txt', '003001293': 'training/7526398.txt', '003001355': 'training/1653950.txt', '003001354': 'training/8605359.txt', '003001357': 'training/7664781.txt', '003001356': 'training/10233882.txt', '003001351': 'training/2302185.txt', '003001350': 'training/10087993.txt', '003001353': 'training/8691123.txt', '003001352': 'training/8691127.txt', '003001258': 'training/9224625.txt', '003001175': 'training/1944294.txt', '003001174': 'training/1493333.txt', '003001177': 'training/9428992.txt', '003001176': 'training/10228026.txt', '003001171': 'training/9136080.txt', '003001170': 'training/8183915.txt', '003001173': 'training/1502179.txt', '003001172': 'training/1375324.txt', '003001272': 'training/9823774.txt', '003001273': 'training/9115810.txt', '003001270': 'training/7862168.txt', '003001271': 'training/7884865.txt', '003001179': 'training/1945879.txt', '003001178': 'training/9285527.txt', '003001274': 'training/10330274.txt', '003001275': 'training/8985116.txt', '00300157': 'training/7843230.txt', '00300156': 'training/1505523.txt', '00300155': 'training/7878466.txt', '00300154': 'training/10364191.txt', '00300153': 'training/1314139.txt', '00300152': 'training/8631821.txt', '00300151': 'training/9247567.txt', '00300150': 'training/1946405.txt', '00300159': 'training/7989745.txt', '00300158': 'training/2105946.txt', '003001188': 'training/7640302.txt', '003001189': 'training/8562512.txt', '003001180': 'training/9893043.txt', '003001181': 'training/9852310.txt', '003001182': 'training/10037751.txt', '003001183': 'training/8816450.txt', '003001184': 'training/8645254.txt', '003001185': 'training/9144472.txt', '003001186': 'training/8513868.txt', '003001187': 'training/8799177.txt', '003001328': 'training/8137243.txt', '003001329': 'training/10233875.txt', '00300164': 'training/9130512.txt', '003001288': 'training/9191057.txt', '00300272': 'development/9796963.txt', '00300273': 'development/1335418.txt', '00300270': 'development/9233628.txt', '00300271': 'development/7537762.txt', '00300276': 'development/9164948.txt', '00300277': 'development/9043959.txt', '00300274': 'development/10455128.txt', '00300275': 'development/9372447.txt', '00300278': 'development/10092805.txt', '00300279': 'development/8892614.txt', '003001289': 'training/2105887.txt', '003001236': 'training/9343210.txt', '003001237': 'training/9299589.txt', '003001234': 'training/8562886.txt', '003001235': 'training/7917514.txt', '003001232': 'training/7853483.txt', '003001233': 'training/7641319.txt', '003001230': 'training/7983701.txt', '003001231': 'training/8523529.txt', '00300199': 'training/9341756.txt', '00300198': 'training/8139041.txt', '003001238': 'training/9178107.txt', '003001239': 'training/1531086.txt', '003001302': 'training/9188651.txt', '003001303': 'training/1987353.txt', '003001300': 'training/9032264.txt', '003001301': 'training/8493578.txt', '003001306': 'training/9032403.txt', '003001307': 'training/2148290.txt', '003001304': 'training/9442374.txt', '003001305': 'training/9725220.txt', '003001308': 'training/10029589.txt', '003001309': 'training/10228008.txt', '003001128': 'training/9440546.txt', '003001129': 'training/10383397.txt', '003001122': 'training/9144218.txt', '003001123': 'training/7739562.txt', '003001120': 'training/2407588.txt', '003001121': 'training/9271588.txt', '003001126': 'training/1953785.txt', '003001127': 'training/8645086.txt', '003001124': 'training/2123553.txt', '003001125': 'training/7657825.txt', '00300128': 'training/9756417.txt', '00300129': 'training/7931077.txt', '00300126': 'training/1931834.txt', '00300127': 'training/8816424.txt', '00300124': 'training/2026605.txt', '00300125': 'training/8480425.txt', '00300122': 'training/9007200.txt', '00300123': 'training/1537389.txt', '00300120': 'training/9109677.txt', '00300121': 'training/9341877.txt', '003001269': 'training/8663230.txt', '003001101': 'training/7541794.txt', '003001261': 'training/2258623.txt', '003001260': 'training/9052735.txt', '003001250': 'training/7969177.txt', '003001251': 'training/9209268.txt', '003001254': 'training/7890658.txt', '003001255': 'training/9916709.txt', '003001348': 'training/9712068.txt', '003001349': 'training/9312192.txt', '003001346': 'training/9442400.txt', '003001347': 'training/8479911.txt', '003001344': 'training/8054477.txt', '003001345': 'training/1655897.txt', '003001342': 'training/8079992.txt', '003001343': 'training/10080532.txt', '003001340': 'training/9376579.txt', '003001341': 'training/7665588.txt', '00300229': 'development/7486667.txt', '00300228': 'development/9154298.txt', '003001259': 'training/7525701.txt', '00300221': 'development/9299399.txt', '00300220': 'development/8896456.txt', '00300223': 'development/7505113.txt', '00300222': 'development/8887687.txt', '00300225': 'development/9351352.txt', '00300224': 'development/10395645.txt', '00300227': 'development/8621538.txt', '00300226': 'development/7510691.txt', '003001166': 'training/8796372.txt', '003001167': 'training/7788861.txt', '003001164': 'training/7545467.txt', '00300169': 'training/10377411.txt', '003001162': 'training/7713868.txt', '003001163': 'training/7862157.txt', '003001249': 'training/9144479.txt', '003001161': 'training/9838061.txt', '00300162': 'training/1618911.txt', '003001246': 'training/9341193.txt', '003001245': 'training/7949138.txt', '003001244': 'training/8872606.txt', '00300166': 'training/8524816.txt', '00300167': 'training/9919536.txt', '003001241': 'training/9840924.txt', '00300170': 'training/7849291.txt', '00300180': 'training/1768652.txt', '00300173': 'training/9101089.txt', '00300172': 'training/8151786.txt', '00300181': 'training/8739563.txt', '003001169': 'training/2122173.txt', '003001218': 'training/2204723.txt', '003001192': 'training/10329958.txt', '003001212': 'training/1829648.txt', '003001213': 'training/8573121.txt', '00300210': 'development/7499266.txt', '00300211': 'development/9878621.txt', '00300212': 'development/9794375.txt', '00300213': 'development/9488049.txt', '00300214': 'development/9372912.txt', '00300215': 'development/9170401.txt', '00300216': 'development/9361029.txt', '00300217': 'development/8108127.txt', '00300218': 'development/7759875.txt', '00300219': 'development/10092801.txt', '00300265': 'development/9794389.txt', '00300264': 'development/7759956.txt', '00300267': 'development/7760810.txt', '00300266': 'development/10090947.txt', '00300261': 'development/9171108.txt', '00300260': 'development/8134378.txt', '00300263': 'development/8675228.txt', '00300262': 'development/8621390.txt', '00300269': 'development/9802971.txt', '00300268': 'development/7575565.txt', '00300282': 'development/9989503.txt', '00300281': 'development/1675604.txt', '00300280': 'development/7769834.txt', '003001337': 'training/10485906.txt', '003001336': 'training/9317131.txt', '003001335': 'training/1946356.txt', '003001334': 'training/7826623.txt', '003001333': 'training/7534663.txt', '003001332': 'training/9741337.txt', '003001331': 'training/9748323.txt', '003001330': 'training/10221658.txt', '003001339': 'training/10328874.txt', '003001338': 'training/7843251.txt', '003001119': 'training/8083467.txt', '003001118': 'training/2237444.txt', '003001117': 'training/7524762.txt', '003001116': 'training/8641467.txt', '003001115': 'training/1533884.txt', '003001114': 'training/9182556.txt', '003001113': 'training/7522257.txt', '003001112': 'training/9045614.txt', '003001111': 'training/7964616.txt', '003001110': 'training/10352258.txt', '00300131': 'training/1315834.txt', '00300130': 'training/9009221.txt', '00300133': 'training/8670269.txt', '00300132': 'training/10068671.txt', '00300135': 'training/9242431.txt', '00300134': 'training/8051172.txt', '00300137': 'training/10330189.txt', '00300136': 'training/10383940.txt', '00300139': 'training/9081693.txt', '00300138': 'training/7964483.txt', '003001209': 'training/10210645.txt', '003001208': 'training/7629157.txt', '003001203': 'training/8443122.txt', '003001202': 'training/2039752.txt', '003001201': 'training/9915863.txt', '003001200': 'training/10452760.txt', '003001207': 'training/2116990.txt', '003001206': 'training/8816454.txt', '003001205': 'training/10202178.txt', '003001204': 'training/8428943.txt', '003001278': 'training/8158122.txt', '003001279': 'training/9175835.txt', '003001373': 'training/9834272.txt', '003001372': 'training/10477716.txt', '003001371': 'training/10233888.txt', '003001370': 'training/8628306.txt', '003001377': 'training/2056282.txt', '003001376': 'training/9185506.txt', '003001375': 'training/8666795.txt', '003001374': 'training/8163464.txt', '003001379': 'training/7890777.txt', '003001378': 'training/10037138.txt', '00300258': 'development/1763325.txt', '00300259': 'development/8622636.txt', '00300254': 'development/1751404.txt', '00300255': 'development/8135780.txt', '00300256': 'development/10096561.txt', '00300257': 'development/10089566.txt', '00300250': 'development/9747720.txt', '00300251': 'development/8388998.txt', '00300252': 'development/9808178.txt', '00300253': 'development/8649779.txt', '003001399': 'training/7865130.txt', '003001398': 'training/1531412.txt', '003001252': 'training/9764907.txt', '003001253': 'training/8790376.txt', '00300179': 'training/8195215.txt', '00300178': 'training/7540578.txt', '003001256': 'training/10426995.txt', '003001257': 'training/7721885.txt', '003001391': 'training/10432288.txt', '003001390': 'training/8504932.txt', '003001393': 'training/9786883.txt', '003001392': 'training/9933632.txt', '003001395': 'training/9882331.txt', '003001394': 'training/8757326.txt', '003001397': 'training/8613707.txt', '003001396': 'training/8605587.txt', '003001165': 'training/8444885.txt', '003001160': 'training/10487715.txt', '003001248': 'training/2216722.txt', '003001159': 'training/9317151.txt', '003001247': 'training/7905504.txt', '00300163': 'training/1911548.txt', '00300160': 'training/7522548.txt', '00300161': 'training/10089140.txt', '003001243': 'training/8852698.txt', '003001242': 'training/8929546.txt', '003001168': 'training/8162052.txt', '00300165': 'training/7718519.txt', '003001226': 'training/9307271.txt', '003001240': 'training/8557975.txt', '003001158': 'training/9277499.txt', '003001153': 'training/8977297.txt', '003001152': 'training/1537556.txt', '003001151': 'training/9276471.txt', '003001150': 'training/9416887.txt', '003001157': 'training/9311921.txt', '003001156': 'training/8386664.txt', '003001155': 'training/10329625.txt', '003001154': 'training/8805630.txt'} TRIAL_KEYS['opinion'] = {'00100159': 'training/20.41.29-15150.txt', '00100158': 'training/11.18.53-27931.txt', '00100151': 'training/10.35.30-7542.txt', '00100150': 'training/wsj_0150.txt', '00100153': 'training/09.53.15-23595.txt', '00100152': 'training/20.20.10-3414.txt', '00100155': 'training/22.04.48-17941.txt', '00100154': 'training/wsj_1073.txt', '00100157': 'training/RindnerBonnie.txt', '00100156': 'training/112C-L014.txt', '001001119': 'training/21.05.07-9115.txt', '001001118': 'training/20.38.49-16233.txt', '001001115': 'training/12.08.27-27397.txt', '001001114': 'training/wsj_0173.txt', '001001117': 'training/21.33.22-7140.txt', '001001116': 'training/21.53.17-27187.txt', '001001111': 'training/21.12.57-28994.txt', '001001110': 'training/03.47.06-11142.txt', '001001113': 'training/20.27.25-21759.txt', '001001112': 'training/08.35.40-23372.txt', '00100146': 'training/11.49.47-8044.txt', '001001195': 'training/20.33.06-778.txt', '001001194': 'training/06.29.16-13721.txt', '001001197': 'training/21.25.32-10485.txt', '001001196': 'training/06.11.16-17420.txt', '0010028': 'development/21.11.57-16690.txt', '0010029': 'development/110CYL200.txt', '001001193': 'training/wsj_0907.txt', '001001192': 'training/20.13.06-23605.txt', '0010024': 'development/20.56.51-26264.txt', '0010025': 'development/21.04.01-4695.txt', '0010026': 'development/sw2071-UTF16-ms98-a-trans.txt', '0010027': 'development/20.41.01-8736.txt', '0010020': 'development/20.33.31-29984.txt', '0010021': 'development/21.37.33-17834.txt', '0010022': 'development/20.26.01-7285.txt', '0010023': 'development/17.58.35-21375.txt', '00100245': 'development/21.11.08-11611.txt', '00100244': 'development/112C-L015.txt', '00100247': 'development/21.24.00-10191.txt', '00100246': 'development/Article247_3500.txt', '00100128': 'training/08.11.35-9355.txt', '00100240': 'development/09.18.38-22306.txt', '00100243': 'development/21.34.10-25509.txt', '00100242': 'development/20.55.32-6296.txt', '00100124': 'training/wsj_0805.txt', '00100125': 'training/20.49.26-27556.txt', '00100126': 'training/20000420_xin_eng-NEW.txt', '00100127': 'training/11.09.24-151.txt', '00100120': 'training/wsj_0551.txt', '00100248': 'development/08.05.55-10723.txt', '00100122': 'training/21.28.50-13504.txt', '00100123': 'training/06.25.38-22700.txt', '001001108': 'training/21.04.32-17074.txt', '001001109': 'training/08.14.55-18533.txt', '001001106': 'training/21.29.08-21533.txt', '001001107': 'training/21.38.12-11637.txt', '001001104': 'training/21.33.09-24778.txt', '001001105': 'training/21.24.50-2535.txt', '001001102': 'training/21.18.25-12166.txt', '001001103': 'training/12.45.44-23455.txt', '001001100': 'training/20.49.23-18398.txt', '001001101': 'training/20.58.35-24209.txt', '00100181': 'training/22.34.49-13286.txt', '0010019': 'training/16.01.33-12919.txt', '0010018': 'training/wsj_0068.txt', '001001184': 'training/01.55.54-27027.txt', '001001185': 'training/16.03.28-26714.txt', '001001182': 'training/ReidSandra.txt', '001001183': 'training/20.23.33-17594.txt', '001001180': 'training/06.29.55-10258.txt', '001001181': 'training/wsj_0760.txt', '0010011': 'training/16.23.04-4326.txt', '0010010': 'training/wsj_0778.txt', '0010013': 'training/AFGP-2002-600045-Trans.txt', '0010012': 'training/21.19.03-816.txt', '0010015': 'training/115CVL037.txt', '0010014': 'training/20.45.56-235.txt', '0010017': 'training/18.09.35-13708.txt', '0010016': 'training/12.30.38-25095.txt', '00100139': 'training/HistoryJerusalem.txt', '00100138': 'training/20.28.43-12622.txt', '00100137': 'training/11.58.59-16532.txt', '00100136': 'training/21.01.08-20603.txt', '00100135': 'training/20.48.51-14201.txt', '00100134': 'training/01.24.22-5235.txt', '00100133': 'training/21.35.14-9737.txt', '00100132': 'training/07.29.33-28852.txt', '00100131': 'training/01.03.02-16670.txt', '00100130': 'training/wsj_0950.txt', '00100230': 'development/SNO-525.txt', '00100231': 'development/20.53.10-28693.txt', '00100232': 'development/06.35.13-26224.txt', '00100233': 'development/22.41.16-14236.txt', '00100234': 'development/wsj_0171.txt', '00100235': 'development/12.22.09-1493.txt', '00100236': 'development/21.05.18-8344.txt', '00100237': 'development/ENRON-pearson-email-25jul02.txt', '00100238': 'development/20.45.06-5529.txt', '00100239': 'development/wsj_0356.txt', '001001139': 'training/12.21.28-26118.txt', '001001138': 'training/22.34.58-23977.txt', '001001186': 'training/17.55.28-22100.txt', '001001187': 'training/13.40.05-15087.txt', '00100218': 'development/20.45.31-22188.txt', '00100219': 'development/21.16.16-1134.txt', '00100223': 'development/21.19.18-13373.txt', '00100222': 'development/08.40.56-18707.txt', '00100221': 'development/13.05.15-16517.txt', '001001127': 'training/Article247_327.txt', '00100227': 'development/20.23.00-9795.txt', '00100226': 'development/21.15.18-84.txt', '001001122': 'training/wsj_0650.txt', '00100224': 'development/18.26.05-8627.txt', '00100182': 'training/21.00.49-23712.txt', '00100183': 'training/15.37.27-28157.txt', '00100229': 'development/20.38.16-13557.txt', '00100228': 'development/10.03.26-15373.txt', '00100186': 'training/20.32.22-8496.txt', '00100187': 'training/21.53.09-11428.txt', '00100184': 'training/20.23.00-14747.txt', '00100185': 'training/wsj_0815.txt', '001001124': 'training/wsj_0695.txt', '001001128': 'training/enron-thread-159550.txt', '001001125': 'training/21.45.32-26215.txt', '00100188': 'training/15.05.17-7881.txt', '00100220': 'development/20.42.17-18974.txt', '001001188': 'training/06.51.18-1222.txt', '001001120': 'training/03.20.43-12807.txt', '001001189': 'training/23.46.20-17835.txt', '001001121': 'training/wsj_0973.txt', '00100115': 'training/21.15.19-21938.txt', '00100114': 'training/12.04.00-12904.txt', '00100117': 'training/23.03.25-11609.txt', '00100116': 'training/115CVL036.txt', '00100111': 'training/20.31.39-22620.txt', '00100225': 'development/21.07.32-24343.txt', '00100113': 'training/08.39.09-12713.txt', '00100112': 'training/20.46.58-22510.txt', '001001123': 'training/20.55.44-16289.txt', '00100119': 'training/22.27.34-26526.txt', '00100118': 'training/110CYL072.txt', '00100216': 'development/21.31.07-25924.txt', '00100217': 'development/13.08.06-1812.txt', '00100214': 'development/wsj_0151.txt', '00100215': 'development/15.08.17-13301.txt', '00100212': 'development/22.11.18-10696.txt', '00100213': 'development/16.15.23-10154.txt', '00100210': 'development/21.09.21-9865.txt', '00100211': 'development/15.11.50-23748.txt', '00100195': 'training/wsj_0144.txt', '00100194': 'training/08.12.40-1611.txt', '00100197': 'training/21.50.57-15245.txt', '00100196': 'training/20.40.44-4958.txt', '00100191': 'training/21.26.55-3999.txt', '00100190': 'training/12.46.57-22041.txt', '00100193': 'training/22.08.22-24562.txt', '00100192': 'training/wsj_0557.txt', '001001151': 'training/wsj_0923.txt', '001001150': 'training/11.05.55-12013.txt', '001001153': 'training/wsj_0266.txt', '001001152': 'training/20000410_nyt-NEW.txt', '001001155': 'training/20.59.14-2538.txt', '001001154': 'training/15.29.53-18099.txt', '001001157': 'training/20.31.22-12363.txt', '001001156': 'training/17.55.53-20579.txt', '001001159': 'training/21.39.02-16166.txt', '001001158': 'training/20.22.42-14586.txt', '001001129': 'training/21.09.23-6361.txt', '00100148': 'training/wsj_0032.txt', '001001191': 'training/21.16.57-11242.txt', '00100149': 'training/20.40.44-29118.txt', '001001190': 'training/114CUL058.txt', '00100198': 'training/21.35.03-11178.txt', '00100160': 'training/wsj_0187.txt', '00100161': 'training/21.37.57-3837.txt', '00100162': 'training/21.20.02-17431.txt', '00100163': 'training/23.39.56-18704.txt', '00100164': 'training/wsj_0811.txt', '00100165': 'training/wsj_0736.txt', '00100166': 'training/wsj_0534.txt', '00100167': 'training/20.28.17-28408.txt', '00100168': 'training/15.26.56-25086.txt', '00100169': 'training/21.49.24-6596.txt', '001001142': 'training/21.44.44-3823.txt', '001001143': 'training/21.12.51-10332.txt', '001001140': 'training/20.52.48-16582.txt', '001001141': 'training/22.32.10-2528.txt', '001001146': 'training/23.25.24-13030.txt', '001001147': 'training/20.04.50-29091.txt', '001001144': 'training/20.23.04-14788.txt', '001001145': 'training/20.45.09-11809.txt', '001001199': 'training/08.27.54-12647.txt', '001001148': 'training/21.27.48-10130.txt', '001001149': 'training/05.19.38-789.txt', '001001131': 'training/20.47.43-22487.txt', '001001198': 'training/21.01.04-6923.txt', '001001130': 'training/20.27.35-29256.txt', '00100189': 'training/21.30.32-24917.txt', '001001137': 'training/22.36.22-2937.txt', '001001136': 'training/21.05.30-9608.txt', '00100147': 'training/14.12.17-21564.txt', '001001135': 'training/20.24.49-8480.txt', '001001134': 'training/21.47.41-28228.txt', '00100173': 'training/wsj_0165.txt', '001001132': 'training/wsj_1035.txt', '00100171': 'training/13.21.23-8227.txt', '00100170': 'training/20.43.13-27926.txt', '00100177': 'training/wsj_0292.txt', '00100176': 'training/20.29.29-13302.txt', '00100175': 'training/15.36.10-18917.txt', '00100174': 'training/23.12.52-21900.txt', '00100241': 'development/06.21.24-4640.txt', '00100179': 'training/wsj_0806.txt', '00100178': 'training/20.32.32-8613.txt', '00100129': 'training/wsj_0555.txt', '001001177': 'training/21.00.22-21144.txt', '001001176': 'training/20.33.43-387.txt', '001001175': 'training/12.07.16-6586.txt', '001001174': 'training/20.37.54-21168.txt', '001001173': 'training/21.31.14-23484.txt', '001001172': 'training/20.43.51-9328.txt', '001001171': 'training/20.52.31-12963.txt', '001001170': 'training/20.34.02-2831.txt', '00100180': 'training/11.38.42-28823.txt', '001001178': 'training/21.07.24-24231.txt', '001001126': 'training/wsj_0527.txt', '001001201': 'training/wsj_0660.txt', '00100121': 'training/06.48.28-6852.txt', '001001168': 'training/04.28.09-24241.txt', '001001169': 'training/00.48.42-17806.txt', '00100144': 'training/08.54.02-18235.txt', '00100145': 'training/118CWL050.txt', '00100142': 'training/20000415_apw_eng-NEW.txt', '00100199': 'training/Article247_66.txt', '00100140': 'training/A1.E2-NEW.txt', '00100141': 'training/20.23.27-26526.txt', '001001160': 'training/21.13.43-28244.txt', '001001161': 'training/03.20.33-11983.txt', '001001162': 'training/21.56.20-863.txt', '001001163': 'training/110CYL071.txt', '001001164': 'training/wsj_0167.txt', '001001165': 'training/12.38.27-3333.txt', '001001166': 'training/20.55.29-11159.txt', '001001167': 'training/20.35.03-16511.txt', '001001179': 'training/20.58.47-19000.txt', '001001133': 'training/wsj_0122.txt', '00100172': 'training/06.12.31-26764.txt', '00100110': 'training/wsj_0160.txt', '001001200': 'training/21.17.08-16542.txt', '001001203': 'training/17.55.10-20068.txt', '001001202': 'training/21.30.26-14869.txt', '00100143': 'training/22.36.40-5626.txt'} TRIAL_KEYS['negation'] = {'0020030': 'evaluation/raw.txt', '0020010': 'training/raw.txt', '0020020': 'development/raw.txt'} REAL_KEYS = {} REAL_KEYS['events'] = {'003003598': 'evaluation/7578250.txt', '003003452': 'evaluation/8875942.txt', '003003454': 'evaluation/10200007.txt', '003003456': 'evaluation/10358178.txt', '003003457': 'evaluation/7636179.txt', '003003458': 'evaluation/7643015.txt', '003003459': 'evaluation/10097788.txt', '003003593': 'evaluation/10208461.txt', '003003595': 'evaluation/7479924.txt', '003003597': 'evaluation/7494272.txt', '003003596': 'evaluation/2378870.txt', '0030011812': 'training/10358756.txt', '0030011827': 'training/9115366.txt', '0030011432': 'training/7961690.txt', '0030011435': 'training/9819151.txt', '0030011346': 'training/9834092.txt', '0030011382': 'training/8799177.txt', '0030011381': 'training/8513868.txt', '00300195': 'training/7645208.txt', '003001611': 'training/9018153.txt', '003002237': 'development/10229275.txt', '0030011544': 'training/7629157.txt', '0030012020': 'training/8985116.txt', '003001156': 'training/9007200.txt', '003002230': 'development/9164841.txt', '0030012808': 'training/10359895.txt', '003003114': 'evaluation/10367897.txt', '003003117': 'evaluation/7478623.txt', '003001208': 'training/1931834.txt', '0030011261': 'training/1375324.txt', '0030011149': 'training/8699118.txt', '0030013059': 'training/9786883.txt', '0030011145': 'training/1656391.txt', '003003442': 'evaluation/8871061.txt', '0030011547': 'training/9715838.txt', '003003119': 'evaluation/7511050.txt', '003003118': 'evaluation/7680653.txt', '0030012807': 'training/8189531.txt', '003003626': 'evaluation/8702466.txt', '0030012064': 'training/9175835.txt', '003003623': 'evaluation/7592671.txt', '003003622': 'evaluation/7719938.txt', '003002594': 'development/7749985.txt', '003003620': 'evaluation/8392092.txt', '003002358': 'development/7488149.txt', '0030012565': 'training/7650486.txt', '0030011752': 'training/8972869.txt', '003001849': 'training/9065737.txt', '0030011933': 'training/9852211.txt', '0030012208': 'training/8184011.txt', '0030013123': 'training/8723387.txt', '003001845': 'training/9616159.txt', '003001844': 'training/9045614.txt', '003001847': 'training/7522257.txt', '003002355': 'development/8913871.txt', '0030012569': 'training/7848921.txt', '003002357': 'development/8388998.txt', '0030012361': 'training/1386962.txt', '0030012360': 'training/7986199.txt', '003001436': 'training/9115242.txt', '003001437': 'training/2234062.txt', '003001539': 'training/8151786.txt', '003001927': 'training/10425206.txt', '003001432': 'training/9398163.txt', '003001433': 'training/2105946.txt', '003001535': 'training/7849291.txt', '003001534': 'training/9819382.txt', '003001537': 'training/1448931.txt', '0030011500': 'training/10202178.txt', '003003692': 'evaluation/7657162.txt', '003003693': 'evaluation/2160380.txt', '003003759': 'evaluation/10077156.txt', '003003691': 'evaluation/1472057.txt', '0030012934': 'training/7890777.txt', '0030012935': 'training/8663174.txt', '0030012937': 'training/10455134.txt', '003003752': 'evaluation/8419644.txt', '003003751': 'evaluation/7613135.txt', '003003757': 'evaluation/10224156.txt', '003003755': 'evaluation/8912888.txt', '003002493': 'development/9796963.txt', '0030011914': 'training/10080875.txt', '003002498': 'development/1335418.txt', '003001467': 'training/1618911.txt', '0030011725': 'training/9096701.txt', '0030012895': 'training/9479628.txt', '00300291': 'development/9488049.txt', '003001691': 'training/9013974.txt', '0030011829': 'training/9840924.txt', '003002127': 'development/7759875.txt', '003001281': 'training/1984449.txt', '003002124': 'development/8108127.txt', '003001287': 'training/10330189.txt', '003001286': 'training/9144338.txt', '003001285': 'training/10383940.txt', '003002129': 'development/9619918.txt', '003001441': 'training/9442377.txt', '003001444': 'training/1777483.txt', '003001449': 'training/7989745.txt', '003002202': 'development/9507734.txt', '0030012299': 'training/7516328.txt', '003001990': 'training/9440546.txt', '003001268': 'training/8504248.txt', '0030012099': 'training/9116279.txt', '003001103': 'training/8797708.txt', '003002100': 'development/9372912.txt', '003001108': 'training/10208867.txt', '00300144': 'training/7742037.txt', '003003650': 'evaluation/1719077.txt', '003002409': 'development/10092783.txt', '0030011964': 'training/8663230.txt', '003003415': 'evaluation/7591095.txt', '0030011968': 'training/7862168.txt', '003002407': 'development/8621390.txt', '00300148': 'training/7542286.txt', '0030012238': 'training/8493578.txt', '003003234': 'evaluation/8387521.txt', '0030011036': 'training/7522304.txt', '003003339': 'evaluation/8387893.txt', '003003231': 'evaluation/8395188.txt', '0030011033': 'training/7569976.txt', '003003335': 'evaluation/10225979.txt', '0030012746': 'training/9915779.txt', '003003337': 'evaluation/10339482.txt', '003003330': 'evaluation/8455941.txt', '003003333': 'evaluation/10027623.txt', '003003411': 'evaluation/10082436.txt', '0030012588': 'training/8264604.txt', '003001666': 'training/9343406.txt', '003001192': 'training/1972889.txt', '003001663': 'training/2112575.txt', '003001190': 'training/9744863.txt', '003001214': 'training/9756417.txt', '003001215': 'training/1986254.txt', '003001217': 'training/7931077.txt', '003001669': 'training/9843840.txt', '003001744': 'training/9886419.txt', '0030012586': 'training/7665588.txt', '0030011866': 'training/9209268.txt', '0030011865': 'training/8757316.txt', '003002279': 'development/7499267.txt', '003002199': 'development/10411007.txt', '003002277': 'development/1719551.txt', '003002274': 'development/10092076.txt', '003002570': 'development/8910577.txt', '003003373': 'evaluation/1423591.txt', '003003481': 'evaluation/7638209.txt', '003003484': 'evaluation/7530239.txt', '003002578': 'development/7769834.txt', '003002565': 'development/8892614.txt', '003003523': 'evaluation/8921937.txt', '0030011104': 'training/9322967.txt', '0030011106': 'training/7565683.txt', '0030011633': 'training/9256234.txt', '0030012632': 'training/9712068.txt', '00300212': 'development/8096091.txt', '0030012739': 'training/10229815.txt', '003003247': 'evaluation/10359012.txt', '0030012733': 'training/8207643.txt', '0030012736': 'training/8973354.txt', '0030012734': 'training/1502202.txt', '0030011260': 'training/9852958.txt', '003003469': 'evaluation/8620546.txt', '003001294': 'training/2123468.txt', '003003702': 'evaluation/8751937.txt', '0030012991': 'training/1765275.txt', '003001633': 'training/10235509.txt', '003003706': 'evaluation/10430944.txt', '003003707': 'evaluation/8505309.txt', '003003463': 'evaluation/8423993.txt', '003003462': 'evaluation/10224470.txt', '003003465': 'evaluation/7482631.txt', '003002314': 'development/7492771.txt', '0030011549': 'training/9057086.txt', '0030012328': 'training/2111447.txt', '003001968': 'training/7657825.txt', '003001966': 'training/2123553.txt', '003001967': 'training/7915519.txt', '0030011543': 'training/1527859.txt', '0030011816': 'training/8011280.txt', '0030011545': 'training/10210645.txt', '0030011546': 'training/7706710.txt', '0030011815': 'training/8557975.txt', '0030011461': 'training/8858156.txt', '0030011465': 'training/8632999.txt', '0030011140': 'training/7688596.txt', '003002568': 'development/1679576.txt', '0030020': 'development/9808189.txt', '003001365': 'training/10347175.txt', '0030011288': 'training/7594489.txt', '003003419': 'evaluation/8289813.txt', '003003414': 'evaluation/7579405.txt', '003003143': 'evaluation/10207061.txt', '003003417': 'evaluation/8970984.txt', '0030011359': 'training/9261181.txt', '003003147': 'evaluation/10187812.txt', '003003144': 'evaluation/2171704.txt', '0030011213': 'training/8444885.txt', '003001496': 'training/7585505.txt', '003001497': 'training/9199464.txt', '003001494': 'training/9312094.txt', '0030012151': 'training/10080948.txt', '003001490': 'training/7718519.txt', '0030012490': 'training/9741337.txt', '0030012996': 'training/9252117.txt', '003002162': 'development/7505113.txt', '0030012495': 'training/8754855.txt', '0030012995': 'training/10210321.txt', '0030011602': 'training/8543789.txt', '0030011601': 'training/2204723.txt', '003002164': 'development/9149909.txt', '003001795': 'training/1431113.txt', '0030012555': 'training/10423406.txt', '0030012998': 'training/9028949.txt', '0030012999': 'training/8504932.txt', '0030012250': 'training/8196620.txt', '003002363': 'development/8649779.txt', '003002362': 'development/9808178.txt', '003002364': 'development/1751404.txt', '0030012770': 'training/8663022.txt', '003001815': 'training/8319574.txt', '003002369': 'development/8135780.txt', '003002368': 'development/9796702.txt', '003001407': 'training/10440930.txt', '003001406': 'training/9247567.txt', '0030012394': 'training/10206983.txt', '003001403': 'training/1946405.txt', '003002392': 'development/8622636.txt', '0030012841': 'training/9834272.txt', '0030012842': 'training/7964516.txt', '0030011620': 'training/9710149.txt', '0030012845': 'training/8163464.txt', '00300248': 'development/8895544.txt', '00300247': 'development/7760807.txt', '003003662': 'evaluation/9510064.txt', '00300245': 'development/7499266.txt', '003001792': 'training/9177217.txt', '00300243': 'development/10395671.txt', '00300143': 'training/10022897.txt', '00300181': 'training/9348104.txt', '0030012198': 'training/1527846.txt', '00300187': 'training/9427533.txt', '0030012196': 'training/9763613.txt', '0030011927': 'training/10318942.txt', '003001380': 'training/7590666.txt', '003002536': 'development/8662960.txt', '003001229': 'training/1510878.txt', '003001387': 'training/7635985.txt', '0030012809': 'training/8664547.txt', '0030011070': 'training/9199300.txt', '003001988': 'training/9394832.txt', '0030011072': 'training/8871623.txt', '003001811': 'training/8419337.txt', '0030011074': 'training/10331989.txt', '00300349': 'evaluation/10356629.txt', '00300346': 'evaluation/10376531.txt', '0030011776': 'training/9863501.txt', '00300340': 'evaluation/7683692.txt', '003001817': 'training/9765295.txt', '0030011670': 'training/7692906.txt', '0030011677': 'training/9307271.txt', '0030013182': 'training/10232385.txt', '003002426': 'development/7759956.txt', '0030013180': 'training/7923175.txt', '003001363': 'training/8179594.txt', '0030013186': 'training/7518803.txt', '003001311': 'training/7594540.txt', '0030012390': 'training/8692924.txt', '003001312': 'training/1532661.txt', '0030011747': 'training/9020049.txt', '0030012258': 'training/9108409.txt', '003001318': 'training/10377075.txt', '003001683': 'training/9442395.txt', '003002472': 'development/9802971.txt', '003003295': 'evaluation/7638186.txt', '003003292': 'evaluation/7592676.txt', '0030013122': 'training/8605348.txt', '003001130': 'training/8602529.txt', '003001134': 'training/8652841.txt', '003003298': 'evaluation/7605996.txt', '003003299': 'evaluation/7647001.txt', '0030012206': 'training/9731208.txt', '003003197': 'evaluation/2214620.txt', '00300110': 'training/10233879.txt', '003002530': 'development/9043959.txt', '003003190': 'evaluation/7642615.txt', '0030012213': 'training/10364260.txt', '003002435': 'development/10090947.txt', '0030012104': 'training/10225377.txt', '003002431': 'development/9794389.txt', '003003208': 'evaluation/10403770.txt', '0030012103': 'training/8186192.txt', '003003563': 'evaluation/1411249.txt', '003003205': 'evaluation/9467376.txt', '003003651': 'evaluation/8676521.txt', '0030012109': 'training/8758898.txt', '003003265': 'evaluation/10330396.txt', '0030012670': 'training/8691123.txt', '003003360': 'evaluation/9024987.txt', '003001639': 'training/9952372.txt', '0030011783': 'training/9343210.txt', '003003364': 'evaluation/8764027.txt', '003002181': 'development/9351352.txt', '003001145': 'training/7935451.txt', '003003368': 'evaluation/10374699.txt', '0030012472': 'training/9748323.txt', '003002185': 'development/8621538.txt', '003002187': 'development/9154298.txt', '003001143': 'training/9109677.txt', '0030013046': 'training/8014029.txt', '003003588': 'evaluation/8702838.txt', '003003589': 'evaluation/2164595.txt', '003003421': 'evaluation/10074432.txt', '003003423': 'evaluation/2255914.txt', '003003583': 'evaluation/7719248.txt', '003001565': 'training/9271352.txt', '003003587': 'evaluation/10378896.txt', '003003585': 'evaluation/8830832.txt', '0030011857': 'training/2216722.txt', '0030011855': 'training/7905504.txt', '0030011921': 'training/9075924.txt', '0030011583': 'training/1372388.txt', '0030011427': 'training/9390691.txt', '003001926': 'training/9271588.txt', '0030011424': 'training/9060666.txt', '0030011588': 'training/10339475.txt', '003003386': 'evaluation/7500028.txt', '0030011421': 'training/9686612.txt', '0030011859': 'training/7969177.txt', '003002549': 'development/7744623.txt', '003002548': 'development/1682217.txt', '0030011392': 'training/7640302.txt', '0030011394': 'training/8960112.txt', '003002228': 'development/9794238.txt', '003003667': 'evaluation/7517211.txt', '003002155': 'development/8887687.txt', '003001925': 'training/9136989.txt', '0030011254': 'training/2122173.txt', '003002544': 'development/10411003.txt', '0030012037': 'training/9625770.txt', '0030011257': 'training/8383677.txt', '003003106': 'evaluation/10228133.txt', '003003107': 'evaluation/10380915.txt', '0030011151': 'training/1958222.txt', '0030011099': 'training/8428966.txt', '003003514': 'evaluation/7719937.txt', '0030011155': 'training/2278044.txt', '003003516': 'evaluation/8704165.txt', '0030011093': 'training/7919963.txt', '003001631': 'training/8015553.txt', '0030011158': 'training/9094628.txt', '0030012607': 'training/8709209.txt', '003003108': 'evaluation/7514630.txt', '0030012605': 'training/8079992.txt', '00300235': 'development/8617979.txt', '00300232': 'development/10395652.txt', '00300233': 'development/8622883.txt', '003001224': 'training/9009221.txt', '003003731': 'evaluation/7744799.txt', '003003732': 'evaluation/7636977.txt', '0030013136': 'training/9442373.txt', '00300238': 'development/9160887.txt', '003003736': 'evaluation/10359894.txt', '0030013032': 'training/10357820.txt', '0030013033': 'training/10432288.txt', '0030012219': 'training/7926759.txt', '0030013037': 'training/9731697.txt', '0030013034': 'training/9933632.txt', '0030013035': 'training/8933518.txt', '003002325': 'development/10402173.txt', '003002324': 'development/8898960.txt', '003001854': 'training/9231664.txt', '0030012216': 'training/2144551.txt', '003002321': 'development/8112299.txt', '003002320': 'development/9353251.txt', '003001850': 'training/9182556.txt', '0030012212': 'training/9872676.txt', '003001917': 'training/10437913.txt', '003001916': 'training/2407588.txt', '0030011458': 'training/9915863.txt', '0030013': 'training/10068588.txt', '003003449': 'evaluation/8877725.txt', '003001910': 'training/8083467.txt', '003001525': 'training/10377411.txt', '003001522': 'training/1896644.txt', '003003355': 'evaluation/10369419.txt', '0030011454': 'training/8491377.txt', '0030011455': 'training/10452760.txt', '003003447': 'evaluation/7673194.txt', '0030012933': 'training/9379002.txt', '0030011328': 'training/9058790.txt', '0030012330': 'training/10226884.txt', '003002520': 'development/9164948.txt', '0030012022': 'training/9032265.txt', '0030011325': 'training/8473495.txt', '003003746': 'evaluation/10228011.txt', '00300158': 'training/10425262.txt', '003003741': 'evaluation/8972019.txt', '0030012929': 'training/7882168.txt', '003002487': 'development/7537762.txt', '003003351': 'evaluation/10092109.txt', '003002482': 'development/9233628.txt', '003003607': 'evaluation/8254185.txt', '003001386': 'training/10070274.txt', '003002488': 'development/10196286.txt', '0030011923': 'training/10197731.txt', '0030011736': 'training/7641319.txt', '0030032': 'evaluation/2217228.txt', '0030012023': 'training/9195127.txt', '0030011731': 'training/7853483.txt', '0030011637': 'training/8662666.txt', '003001147': 'training/7706235.txt', '003001385': 'training/10202937.txt', '0030011313': 'training/10037751.txt', '0030011738': 'training/8562886.txt', '0030012501': 'training/7534663.txt', '003001295': 'training/7964483.txt', '003001359': 'training/9015187.txt', '003001291': 'training/1668145.txt', '0030012506': 'training/8773576.txt', '0030012440': 'training/9649186.txt', '0030036': 'evaluation/7770085.txt', '003001299': 'training/8555489.txt', '0030012969': 'training/7869038.txt', '003001450': 'training/7522548.txt', '00300116': 'training/9310836.txt', '0030012388': 'training/10200294.txt', '0030012813': 'training/8628306.txt', '0030012812': 'training/8943338.txt', '003001590': 'training/7609053.txt', '0030012817': 'training/10233888.txt', '003003324': 'evaluation/2401033.txt', '003001595': 'training/10203577.txt', '003001174': 'training/1537389.txt', '003001170': 'training/7958618.txt', '003003503': 'evaluation/7721747.txt', '003001178': 'training/10454636.txt', '00300156': 'training/9180266.txt', '00300155': 'training/8977228.txt', '00300154': 'training/7888116.txt', '0030011975': 'training/7884865.txt', '00300152': 'training/10087185.txt', '00300151': 'training/9743506.txt', '0030012321': 'training/9130632.txt', '0030011978': 'training/9823774.txt', '0030011027': 'training/7836389.txt', '0030011026': 'training/9808586.txt', '0030011024': 'training/2105528.txt', '0030011742': 'training/8790371.txt', '0030011301': 'training/9893043.txt', '0030011740': 'training/7917514.txt', '0030011269': 'training/1502179.txt', '003003329': 'evaluation/10229837.txt', '003001180': 'training/7848679.txt', '003003327': 'evaluation/10373522.txt', '003001182': 'training/9548490.txt', '003001183': 'training/7892566.txt', '0030011619': 'training/8314792.txt', '003003321': 'evaluation/7678779.txt', '0030012434': 'training/10064103.txt', '0030011345': 'training/9053449.txt', '003001673': 'training/1903417.txt', '0030012433': 'training/7682243.txt', '003001671': 'training/2017177.txt', '0030011833': 'training/8929546.txt', '003001772': 'training/8628295.txt', '003001853': 'training/8641467.txt', '0030011893': 'training/1782151.txt', '0030011891': 'training/9224625.txt', '0030011895': 'training/7525701.txt', '0030011899': 'training/9889197.txt', '0030012500': 'training/10194443.txt', '003002268': 'development/10415075.txt', '003001427': 'training/1505523.txt', '003003559': 'evaluation/10348340.txt', '00300160': 'training/8809111.txt', '003003494': 'evaluation/10364193.txt', '003003495': 'evaluation/10358028.txt', '003002468': 'development/7575565.txt', '0030011290': 'training/9285527.txt', '0030011379': 'training/9824485.txt', '003003493': 'evaluation/8955000.txt', '0030012174': 'training/9265727.txt', '003003557': 'evaluation/8515075.txt', '0030011114': 'training/10477683.txt', '003002466': 'development/8389757.txt', '0030011112': 'training/9416887.txt', '003003553': 'evaluation/2205477.txt', '003003552': 'evaluation/10075873.txt', '0030012646': 'training/9312192.txt', '003002196': 'development/8101106.txt', '003003399': 'evaluation/8458581.txt', '003003398': 'evaluation/7545680.txt', '0030012642': 'training/10202024.txt', '003003157': 'evaluation/1670606.txt', '003003255': 'evaluation/10427971.txt', '003003392': 'evaluation/10072078.txt', '0030012723': 'training/1907460.txt', '003003390': 'evaluation/7668385.txt', '0030012648': 'training/7730624.txt', '0030012649': 'training/10087993.txt', '003003478': 'evaluation/8892903.txt', '003003776': 'evaluation/7663520.txt', '0030013173': 'training/7859290.txt', '0030013172': 'training/7859735.txt', '003003773': 'evaluation/8460169.txt', '003003473': 'evaluation/10087181.txt', '0030013074': 'training/9882331.txt', '003003471': 'evaluation/8441377.txt', '003003476': 'evaluation/10194020.txt', '0030013073': 'training/2146676.txt', '003001953': 'training/8170476.txt', '0030012311': 'training/10082473.txt', '0030011759': 'training/8668213.txt', '003001957': 'training/9388475.txt', '003001956': 'training/7739562.txt', '0030011642': 'training/9219058.txt', '0030012318': 'training/7520914.txt', '0030011556': 'training/1829648.txt', '0030011552': 'training/9201242.txt', '0030011551': 'training/1620119.txt', '003002219': 'development/10092775.txt', '003001706': 'training/8454603.txt', '0030011414': 'training/10329958.txt', '0030012963': 'training/8839844.txt', '0030011412': 'training/2121746.txt', '0030011413': 'training/10430908.txt', '003002213': 'development/8925904.txt', '003001482': 'training/1911548.txt', '0030011364': 'training/9261367.txt', '0030011365': 'training/10072497.txt', '0030011203': 'training/7545467.txt', '0030011207': 'training/9973520.txt', '00300167': 'training/10191934.txt', '003003178': 'evaluation/2159372.txt', '003003176': 'evaluation/8898948.txt', '0030011160': 'training/8386664.txt', '0030011161': 'training/9311921.txt', '0030011166': 'training/9277499.txt', '003003172': 'evaluation/7489741.txt', '003003171': 'evaluation/10381500.txt', '003003609': 'evaluation/7565722.txt', '003003608': 'evaluation/7638173.txt', '0030012826': 'training/10477716.txt', '0030012824': 'training/8816467.txt', '0030012828': 'training/2196387.txt', '0030012829': 'training/10209041.txt', '003002600': 'development/9989503.txt', '00300320': 'evaluation/2398533.txt', '0030012984': 'training/9428796.txt', '003002177': 'development/10395645.txt', '0030012981': 'training/10381501.txt', '0030012516': 'training/1946356.txt', '0030012547': 'training/9374467.txt', '0030012546': 'training/9847292.txt', '003001782': 'training/8600942.txt', '003001780': 'training/9887050.txt', '0030011679': 'training/10233947.txt', '003001826': 'training/7912114.txt', '0030011031': 'training/7802642.txt', '0030012260': 'training/9725220.txt', '0030012261': 'training/9032403.txt', '0030012262': 'training/2148290.txt', '003001820': 'training/10352258.txt', '003001416': 'training/1314139.txt', '003003389': 'evaluation/10318814.txt', '0030012384': 'training/9438495.txt', '003001412': 'training/8631821.txt', '0030012269': 'training/10029589.txt', '0030011524': 'training/10352279.txt', '003001557': 'training/8164652.txt', '003001556': 'training/9078381.txt', '003001555': 'training/8039243.txt', '0030012854': 'training/8666795.txt', '0030012918': 'training/10438731.txt', '0030012850': 'training/2083253.txt', '003003674': 'evaluation/10320367.txt', '0030012912': 'training/10037138.txt', '003003671': 'evaluation/7544001.txt', '003001558': 'training/9101089.txt', '00300192': 'training/8843413.txt', '00300191': 'training/10349513.txt', '00300197': 'training/8441379.txt', '003001230': 'training/1645452.txt', '00300194': 'training/10023774.txt', '0030011937': 'training/8543841.txt', '0030011936': 'training/7969146.txt', '0030012181': 'training/7589085.txt', '0030012180': 'training/7526398.txt', '0030012183': 'training/10378895.txt', '0030011067': 'training/9817603.txt', '0030011065': 'training/9724034.txt', '00300354': 'evaluation/9492977.txt', '0030011061': 'training/9697844.txt', '00300356': 'evaluation/10417333.txt', '0030011707': 'training/7542591.txt', '00300292': 'development/7478994.txt', '0030012967': 'training/10229841.txt', '0030011702': 'training/8186461.txt', '0030011069': 'training/7720085.txt', '00300215': 'development/9032466.txt', '0030013199': 'training/9119025.txt', '0030011660': 'training/9974401.txt', '003001739': 'training/9237716.txt', '003002102': 'development/9880555.txt', '0030012618': 'training/8707445.txt', '0030011669': 'training/9122255.txt', '0030011668': 'training/8455611.txt', '003001731': 'training/8067997.txt', '003001730': 'training/9341756.txt', '0030013195': 'training/10497131.txt', '003001735': 'training/7541794.txt', '003001303': 'training/9081693.txt', '003003506': 'evaluation/1434944.txt', '0030012093': 'training/2304473.txt', '0030012290': 'training/10102628.txt', '003001801': 'training/9032344.txt', '0030012358': 'training/8207793.txt', '003001123': 'training/9858618.txt', '003001126': 'training/7579328.txt', '0030011296': 'training/1945879.txt', '0030011156': 'training/10329625.txt', '0030012253': 'training/9440542.txt', '0030011989': 'training/9115810.txt', '0030012241': 'training/1987353.txt', '003003490': 'evaluation/10373548.txt', '003003184': 'evaluation/1631130.txt', '0030012134': 'training/9191057.txt', '0030012135': 'training/2105887.txt', '0030012137': 'training/10364157.txt', '0030011012': 'training/10329626.txt', '003003284': 'evaluation/7587061.txt', '003002422': 'development/8675228.txt', '0030012571': 'training/9233623.txt', '003003216': 'evaluation/1827138.txt', '0030011018': 'training/1628621.txt', '003002502': 'development/9233802.txt', '0030012203': 'training/9047239.txt', '003003211': 'evaluation/7584520.txt', '0030011795': 'training/9291089.txt', '0030011794': 'training/8657101.txt', '0030012767': 'training/8627791.txt', '003003354': 'evaluation/7706983.txt', '0030011791': 'training/9178107.txt', '0030011790': 'training/9299589.txt', '0030012763': 'training/8627768.txt', '003002221': 'development/7506531.txt', '003001605': 'training/10358154.txt', '0030012769': 'training/7929104.txt', '0030012441': 'training/1896645.txt', '003002194': 'development/9356494.txt', '0030012443': 'training/8137243.txt', '003003436': 'evaluation/7519845.txt', '003003437': 'evaluation/2321018.txt', '003003434': 'evaluation/7618277.txt', '003003430': 'evaluation/7641903.txt', '0030011593': 'training/2006151.txt', '0030011298': 'training/7641692.txt', '0030011591': 'training/7670114.txt', '003003438': 'evaluation/10367698.txt', '003001650': 'training/8761381.txt', '0030012098': 'training/9277478.txt', '0030011840': 'training/8852698.txt', '0030011843': 'training/8872606.txt', '003001831': 'training/7964616.txt', '0030011110': 'training/10327050.txt', '0030011848': 'training/9341193.txt', '003003640': 'evaluation/10103059.txt', '0030012613': 'training/8054477.txt', '003003139': 'evaluation/7602114.txt', '003002537': 'development/10092805.txt', '003002253': 'development/10415609.txt', '003002252': 'development/7479915.txt', '003003133': 'evaluation/2357216.txt', '003003130': 'evaluation/10433370.txt', '003002554': 'development/10416957.txt', '003003134': 'evaluation/8307982.txt', '003003502': 'evaluation/7488016.txt', '0030011088': 'training/10069412.txt', '0030011124': 'training/1537556.txt', '003003501': 'evaluation/10331564.txt', '0030011122': 'training/9276471.txt', '0030011081': 'training/9218843.txt', '0030011420': 'training/9710582.txt', '003001837': 'training/10381655.txt', '0030011128': 'training/7706727.txt', '0030011100': 'training/10206480.txt', '003003645': 'evaluation/2268914.txt', '003003647': 'evaluation/10359138.txt', '0030012942': 'training/9242564.txt', '0030012944': 'training/7925300.txt', '003002346': 'development/9825820.txt', '0030013146': 'training/1939341.txt', '003003727': 'evaluation/10050877.txt', '003003724': 'evaluation/1909740.txt', '0030013143': 'training/2006423.txt', '0030013021': 'training/10022882.txt', '003001420': 'training/8641346.txt', '0030012224': 'training/9188842.txt', '0030013140': 'training/9257843.txt', '0030012226': 'training/7512565.txt', '0030013026': 'training/8507862.txt', '003002336': 'development/8901569.txt', '0030012342': 'training/8135784.txt', '003002330': 'development/7495759.txt', '003001513': 'training/9177216.txt', '0030011448': 'training/9309306.txt', '0030011564': 'training/8573121.txt', '0030012722': 'training/7520093.txt', '0030013051': 'training/9837745.txt', '0030012679': 'training/8605359.txt', '003001514': 'training/1851861.txt', '0030011445': 'training/7659529.txt', '0030011444': 'training/9115394.txt', '003002394': 'development/8134378.txt', '0030011239': 'training/8162052.txt', '003002360': 'development/8877104.txt', '0030011230': 'training/8077662.txt', '0030011338': 'training/8645254.txt', '00300138': 'training/9722600.txt', '003001429': 'training/7843230.txt', '0030011965': 'training/9135552.txt', '0030012448': 'training/10369255.txt', '003001258': 'training/10194184.txt', '00300319': 'evaluation/1347914.txt', '0030012227': 'training/9680181.txt', '00300311': 'evaluation/10229324.txt', '00300310': 'evaluation/7478591.txt', '00300312': 'evaluation/10096574.txt', '0030011339': 'training/9133417.txt', '003002148': 'development/9299399.txt', '003003282': 'evaluation/2283805.txt', '0030013028': 'training/9278334.txt', '003002145': 'development/10092091.txt', '003003202': 'evaluation/10090942.txt', '003002141': 'development/7605990.txt', '003002140': 'development/10092801.txt', '003002142': 'development/9799798.txt', '0030012536': 'training/10485906.txt', '003002388': 'development/9804806.txt', '0030013098': 'training/7534293.txt', '003001626': 'training/1768652.txt', '0030013094': 'training/8809409.txt', '003002382': 'development/10096561.txt', '003001344': 'training/9111040.txt', '003001257': 'training/9242431.txt', '003001342': 'training/8506326.txt', '0030012539': 'training/7594468.txt', '003001341': 'training/9211847.txt', '00300229': 'development/9356353.txt', '003002190': 'development/7486667.txt', '003003388': 'evaluation/7554315.txt', '003001463': 'training/10089140.txt', '00300387': 'evaluation/7541987.txt', '00300384': 'evaluation/8890196.txt', '003003508': 'evaluation/10080544.txt', '00300221': 'development/8106512.txt', '003001585': 'training/8449904.txt', '00300223': 'development/8690900.txt', '00300389': 'evaluation/10328107.txt', '003001393': 'training/9633826.txt', '003001741': 'training/8947512.txt', '0030011566': 'training/9663467.txt', '00300168': 'training/1482376.txt', '003002595': 'development/7747447.txt', '003002588': 'development/1675604.txt', '0030011567': 'training/9223506.txt', '003003779': 'evaluation/8430069.txt', '003002587': 'development/9164919.txt', '0030012512': 'training/7826623.txt', '00300166': 'training/9634075.txt', '003001511': 'training/9919536.txt', '003002583': 'development/1355356.txt', '003002582': 'development/8108414.txt', '00300368': 'evaluation/2280769.txt', '0030011945': 'training/2065663.txt', '0030011284': 'training/9428992.txt', '0030011947': 'training/9199305.txt', '0030011940': 'training/7565732.txt', '0030011192': 'training/2127692.txt', '0030011942': 'training/1313226.txt', '0030011190': 'training/7713868.txt', '0030011054': 'training/2193097.txt', '00300363': 'evaluation/8709636.txt', '00300283': 'development/9794375.txt', '0030011949': 'training/2017258.txt', '0030011050': 'training/10438457.txt', '0030011051': 'training/8804437.txt', '0030011751': 'training/10201929.txt', '003002182': 'development/7510691.txt', '003003316': 'evaluation/8641805.txt', '003003314': 'evaluation/10090931.txt', '0030012400': 'training/8871649.txt', '0030011694': 'training/9047238.txt', '003001645': 'training/8642282.txt', '003001644': 'training/9710600.txt', '003001760': 'training/1653056.txt', '003001888': 'training/1560002.txt', '003001634': 'training/9011569.txt', '003001767': 'training/9306134.txt', '003001882': 'training/8608243.txt', '003001885': 'training/9344365.txt', '0030012745': 'training/9862666.txt', '003001246': 'training/10068671.txt', '0030011881': 'training/9916709.txt', '0030011880': 'training/7890658.txt', '0030011883': 'training/10426995.txt', '003003528': 'evaluation/8206753.txt', '0030011885': 'training/7721885.txt', '0030011887': 'training/1492121.txt', '003003424': 'evaluation/8786324.txt', '0030011889': 'training/9828130.txt', '0030011494': 'training/9446322.txt', '0030012317': 'training/7518838.txt', '0030011492': 'training/7966569.txt', '0030012630': 'training/8479911.txt', '0030011491': 'training/8428943.txt', '003002291': 'development/9211933.txt', '0030013044': 'training/7929355.txt', '0030012741': 'training/8709191.txt', '0030011800': 'training/9299590.txt', '003002298': 'development/8098881.txt', '0030011281': 'training/1944294.txt', '003003251': 'evaluation/10224223.txt', '0030011047': 'training/9187264.txt', '0030012047': 'training/8934542.txt', '0030011282': 'training/10228026.txt', '003002116': 'development/9170401.txt', '0030012161': 'training/10202027.txt', '0030012160': 'training/9834081.txt', '003002513': 'development/10455128.txt', '003002459': 'development/1680914.txt', '003001196': 'training/2026605.txt', '003003546': 'evaluation/10430025.txt', '003003547': 'evaluation/2238523.txt', '0030012169': 'training/7623828.txt', '003003268': 'evaluation/7528668.txt', '003002452': 'development/7760810.txt', '003002451': 'development/8622948.txt', '003003541': 'evaluation/1906155.txt', '003003267': 'evaluation/1705836.txt', '0030012657': 'training/7945272.txt', '0030012656': 'training/7662982.txt', '0030012651': 'training/9398404.txt', '0030012719': 'training/1964088.txt', '0030012653': 'training/9712047.txt', '0030012652': 'training/2302185.txt', '003003384': 'evaluation/7500040.txt', '0030012716': 'training/9765256.txt', '003003387': 'evaluation/8943389.txt', '0030012712': 'training/7664781.txt', '003001660': 'training/7543515.txt', '003003767': 'evaluation/10384094.txt', '003003765': 'evaluation/2367616.txt', '0030011499': 'training/9326236.txt', '003003763': 'evaluation/8319912.txt', '003003760': 'evaluation/9125308.txt', '003003761': 'evaluation/10201899.txt', '0030013065': 'training/8088776.txt', '003003446': 'evaluation/2352328.txt', '003003358': 'evaluation/7565736.txt', '0030013060': 'training/8757326.txt', '003003768': 'evaluation/10025668.txt', '0030012307': 'training/10069428.txt', '0030011198': 'training/7862157.txt', '003001947': 'training/9144218.txt', '0030012580': 'training/9414129.txt', '0030011626': 'training/10430922.txt', '0030012790': 'training/7565811.txt', '003001202': 'training/9428793.txt', '0030012393': 'training/8641353.txt', '0030012308': 'training/1847170.txt', '0030011405': 'training/9813178.txt', '003003338': 'evaluation/10029594.txt', '003003302': 'evaluation/2142528.txt', '0030011403': 'training/8562512.txt', '0030011402': 'training/9070319.txt', '0030011377': 'training/10202034.txt', '003002203': 'development/7747440.txt', '003003303': 'evaluation/7640301.txt', '0030011179': 'training/10487715.txt', '003001654': 'training/9013959.txt', '00300327': 'evaluation/7657645.txt', '0030011272': 'training/8038234.txt', '0030011273': 'training/1493333.txt', '0030011270': 'training/9374642.txt', '0030011271': 'training/9384661.txt', '0030011171': 'training/9317151.txt', '003003169': 'evaluation/10195379.txt', '0030012364': 'training/9442380.txt', '003001656': 'training/1851743.txt', '0030011779': 'training/8663060.txt', '003003618': 'evaluation/7481768.txt', '003003138': 'evaluation/10370372.txt', '003003613': 'evaluation/7692652.txt', '003003610': 'evaluation/10443228.txt', '0030011851': 'training/7637809.txt', '003003614': 'evaluation/10357818.txt', '0030012570': 'training/9032271.txt', '0030013110': 'training/7865130.txt', '0030012572': 'training/10328874.txt', '0030012573': 'training/9376579.txt', '0030013115': 'training/2072454.txt', '0030011002': 'training/10383397.txt', '0030011426': 'training/7927175.txt', '0030013119': 'training/9095577.txt', '0030012579': 'training/7554389.txt', '003001835': 'training/10224109.txt', '0030012274': 'training/7590249.txt', '003002347': 'development/9747720.txt', '0030011538': 'training/2116990.txt', '003001421': 'training/10364191.txt', '0030012371': 'training/9730957.txt', '0030012376': 'training/9209438.txt', '003001426': 'training/7878466.txt', '0030011326': 'training/8816450.txt', '003001540': 'training/2192264.txt', '0030011530': 'training/7532282.txt', '003001542': 'training/10029571.txt', '003001543': 'training/1520341.txt', '0030011534': 'training/8816454.txt', '0030011537': 'training/8175775.txt', '0030012617': 'training/1655897.txt', '00300264': 'development/9352360.txt', '003003719': 'evaluation/10207023.txt', '003003682': 'evaluation/7588326.txt', '0030011858': 'training/9144479.txt', '003001873': 'training/7524762.txt', '0030011303': 'training/9852310.txt', '0030012906': 'training/2056282.txt', '0030012747': 'training/8163658.txt', '0030011305': 'training/1883525.txt', '0030011900': 'training/9052735.txt', '0030011258': 'training/9136080.txt', '00300298': 'development/8626528.txt', '0030011907': 'training/2258623.txt', '00300324': 'evaluation/10101602.txt', '003003239': 'evaluation/7559881.txt', '0030012888': 'training/1899335.txt', '0030011715': 'training/2269427.txt', '0030011714': 'training/8496329.txt', '0030012887': 'training/9873041.txt', '0030011711': 'training/7983701.txt', '0030012882': 'training/7706273.txt', '003001685': 'training/7909357.txt', '0030011650': 'training/9243748.txt', '003002117': 'development/9361029.txt', '0030011652': 'training/8871617.txt', '003001725': 'training/8139041.txt', '003002118': 'development/8098618.txt', '0030011683': 'training/9570512.txt', '0030011659': 'training/8725939.txt', '003001723': 'training/1419905.txt', '0030012031': 'training/10428853.txt', '0030012284': 'training/10329845.txt', '0030012283': 'training/10228008.txt', '0030011398': 'training/8164666.txt', '003003300': 'evaluation/7537687.txt', '0030012289': 'training/9334193.txt', '0030012032': 'training/8702849.txt', '003001119': 'training/8930131.txt', '003001118': 'training/7510689.txt', '003001117': 'training/10079106.txt', '0030012035': 'training/10075645.txt', '003001115': 'training/9052839.txt', '003001112': 'training/10372271.txt', '003001111': 'training/9874515.txt', '0030011255': 'training/8183915.txt', '0030012084': 'training/10446999.txt', '00300130': 'training/7629508.txt', '0030012086': 'training/2394747.txt', '00300132': 'training/8530384.txt', '0030012083': 'training/9858241.txt', '0030012127': 'training/8662845.txt', '003002412': 'development/9360945.txt', '0030011993': 'training/9845517.txt', '0030012688': 'training/10358173.txt', '003003280': 'evaluation/10086725.txt', '003003348': 'evaluation/10080908.txt', '0030011153': 'training/8816395.txt', '0030012693': 'training/1653950.txt', '003002146': 'development/8896456.txt', '003003226': 'evaluation/10369681.txt', '003003225': 'evaluation/2034676.txt', '003003224': 'evaluation/7635572.txt', '0030012699': 'training/10482545.txt', '0030012698': 'training/10233882.txt', '0030012752': 'training/9632764.txt', '003003343': 'evaluation/10087648.txt', '003003344': 'evaluation/10224135.txt', '003003345': 'evaluation/7613138.txt', '0030012757': 'training/1419903.txt', '003001758': 'training/9690455.txt', '003001209': 'training/8816424.txt', '003001613': 'training/7540578.txt', '003001619': 'training/7540942.txt', '003001200': 'training/8480425.txt', '003001755': 'training/1676267.txt', '003001757': 'training/8566023.txt', '003001332': 'training/9199898.txt', '0030011879': 'training/8790376.txt', '003001242': 'training/1315834.txt', '0030011219': 'training/8796372.txt', '0030011874': 'training/8015552.txt', '003002389': 'development/9234696.txt', '0030011876': 'training/9724088.txt', '0030011877': 'training/9764907.txt', '0030013109': 'training/2088505.txt', '003003160': 'evaluation/10024618.txt', '0030011090': 'training/9073544.txt', '003002246': 'development/10403270.txt', '003001970': 'training/1953785.txt', '0030012016': 'training/10330274.txt', '003002240': 'development/7478534.txt', '003002175': 'development/10409763.txt', '003002567': 'development/1740663.txt', '003003125': 'evaluation/10327064.txt', '003003126': 'evaluation/7491389.txt', '0030012061': 'training/10082134.txt', '003002562': 'development/8623933.txt', '003003122': 'evaluation/10384153.txt', '0030011097': 'training/10438913.txt', '0030011135': 'training/8805630.txt', '003002383': 'development/10089566.txt', '003003533': 'evaluation/1404124.txt', '0030011130': 'training/7998962.txt', '0030011133': 'training/8977297.txt', '0030011132': 'training/8577772.txt', '0030012624': 'training/9218534.txt', '0030012626': 'training/9442400.txt', '0030012228': 'training/10102791.txt', '0030013096': 'training/7859743.txt', '003003539': 'evaluation/10421786.txt', '003003538': 'evaluation/7508937.txt', '0030012974': 'training/7594456.txt', '0030012976': 'training/8635523.txt', '003003655': 'evaluation/10224278.txt', '0030013159': 'training/9224203.txt', '0030013198': 'training/8018594.txt', '0030012972': 'training/9119999.txt', '003003718': 'evaluation/7512079.txt', '0030012079': 'training/9130477.txt', '0030013156': 'training/8628274.txt', '0030013151': 'training/9464836.txt', '003003710': 'evaluation/7982959.txt', '0030013016': 'training/1464736.txt', '0030013138': 'training/9334723.txt', '003002307': 'development/8621773.txt', '0030012232': 'training/9032264.txt', '003001787': 'training/1429562.txt', '003001975': 'training/8645086.txt', '0030012611': 'training/10080532.txt', '003001977': 'training/10209036.txt', '0030012069': 'training/8816436.txt', '0030011727': 'training/8523529.txt', '0030011094': 'training/9888865.txt', '00300280': 'development/1712226.txt', '0030012957': 'training/8659190.txt', '0030031': 'evaluation/10101249.txt', '003001505': 'training/8524816.txt', '0030033': 'evaluation/2222774.txt', '003001507': 'training/9707608.txt', '0030011571': 'training/9971788.txt', '003002101': 'development/9733846.txt', '0030039': 'evaluation/8810619.txt', '0030012541': 'training/7843251.txt', '003002514': 'development/9372447.txt', '0030011579': 'training/10426996.txt', '0030011473': 'training/2039752.txt', '0030012420': 'training/7981603.txt', '0030011228': 'training/7788861.txt', '0030011186': 'training/9838061.txt', '003003242': 'evaluation/10078502.txt', '0030011342': 'training/9144472.txt', '0030012543': 'training/9916078.txt', '0030011340': 'training/10438843.txt', '0030011592': 'training/8144878.txt', '003003150': 'evaluation/7576301.txt', '0030011029': 'training/7578980.txt', '003003152': 'evaluation/10231345.txt', '003003154': 'evaluation/7678994.txt', '003003405': 'evaluation/7543512.txt', '0030011215': 'training/8649822.txt', '003001485': 'training/8428000.txt', '0030013130': 'training/10064064.txt', '003001483': 'training/9130512.txt', '003002373': 'development/8903467.txt', '0030011804': 'training/10101034.txt', '00300385': 'evaluation/10359457.txt', '0030012492': 'training/9121455.txt', '003003164': 'evaluation/7579399.txt', '003003578': 'evaluation/2386492.txt', '0030012951': 'training/1956769.txt', '0030012485': 'training/1454801.txt', '003002158': 'development/1700011.txt', '0030011690': 'training/8468462.txt', '0030012523': 'training/9716600.txt', '0030012522': 'training/9317131.txt', '0030013134': 'training/9729045.txt', '0030013083': 'training/8605587.txt', '003001372': 'training/8978306.txt', '0030012240': 'training/9188651.txt', '0030013080': 'training/7543076.txt', '003002398': 'development/9171108.txt', '003001371': 'training/1981844.txt', '003001809': 'training/7673240.txt', '003002395': 'development/9159166.txt', '0030013089': 'training/1380242.txt', '0030013088': 'training/8613707.txt', '003002390': 'development/1763325.txt', '003002564': 'development/8524232.txt', '003001581': 'training/9733716.txt', '0030013093': 'training/1531412.txt', '00300258': 'development/10101001.txt', '00300390': 'evaluation/10438758.txt', '00300393': 'evaluation/10352273.txt', '0030012661': 'training/8691127.txt', '0030012875': 'training/9185506.txt', '00300255': 'development/8621480.txt', '00300256': 'development/9878621.txt', '00300399': 'evaluation/7565675.txt', '003002328': 'development/9166418.txt', '003001250': 'training/8670269.txt', '0030013160': 'training/8062448.txt', '0030011959': 'training/9783909.txt', '0030011958': 'training/8436816.txt', '00300178': 'training/7864072.txt', '003001256': 'training/8051172.txt', '003003129': 'evaluation/7707514.txt', '0030011953': 'training/8747083.txt', '0030012383': 'training/9031085.txt', '00300121': 'training/2023633.txt', '003001600': 'training/7823943.txt', '0030012215': 'training/10233927.txt', '00300379': 'evaluation/10443688.txt', '0030011049': 'training/9032280.txt', '0030012381': 'training/8853898.txt', '0030012126': 'training/9835626.txt', '003001949': 'training/9720648.txt', '00300370': 'evaluation/2320113.txt', '0030011041': 'training/9209284.txt', '0030011040': 'training/2172166.txt', '003001163': 'training/7622191.txt', '0030011814': 'training/1531086.txt', '003003306': 'evaluation/7604283.txt', '003003307': 'evaluation/8910360.txt', '003001658': 'training/9400372.txt', '0030011762': 'training/8960365.txt', '0030011846': 'training/7949138.txt', '0030011687': 'training/8052854.txt', '003001852': 'training/1533884.txt', '0030011685': 'training/8617207.txt', '0030013069': 'training/9419430.txt', '0030012248': 'training/9442374.txt', '0030012410': 'training/8298127.txt', '003001205': 'training/9190901.txt', '003001713': 'training/2109187.txt', '003001712': 'training/10221643.txt', '003001892': 'training/2237444.txt', '00300253': 'development/1990263.txt', '003001890': 'training/8666783.txt', '003001718': 'training/10229820.txt', '0030013152': 'training/9277450.txt', '003003369': 'evaluation/10201984.txt', '003003287': 'evaluation/7591091.txt', '0030011489': 'training/2314899.txt', '0030011488': 'training/7858491.txt', '003003283': 'evaluation/7605997.txt', '0030011000': 'training/10491412.txt', '003001243': 'training/1658795.txt', '0030012427': 'training/1470918.txt', '0030011481': 'training/8443122.txt', '0030012356': 'training/8018558.txt', '003003722': 'evaluation/1453013.txt', '003003220': 'evaluation/2156043.txt', '0030012399': 'training/1545132.txt', '003002287': 'development/9852070.txt', '003002284': 'development/8626752.txt', '003003378': 'evaluation/10438903.txt', '0030012057': 'training/1583734.txt', '0030012759': 'training/8325322.txt', '003003214': 'evaluation/10435586.txt', '003002289': 'development/1740667.txt', '0030012663': 'training/8759721.txt', '0030012050': 'training/8158122.txt', '003002440': 'development/7747417.txt', '0030012117': 'training/9311830.txt', '003003571': 'evaluation/8871608.txt', '0030011007': 'training/7523507.txt', '003002446': 'development/10229231.txt', '003002447': 'development/8561779.txt', '003003270': 'evaluation/8955173.txt', '003003670': 'evaluation/1421207.txt', '003003272': 'evaluation/8283032.txt', '0030012413': 'training/8634413.txt', '0030012665': 'training/10480426.txt', '003003277': 'evaluation/9031090.txt', '003003371': 'evaluation/9728057.txt', '003001159': 'training/9712026.txt', '0030012370': 'training/10217534.txt', '003003375': 'evaluation/2161813.txt', '0030013190': 'training/9074948.txt', '003001153': 'training/9341877.txt', '003001620': 'training/8195215.txt', '0030012709': 'training/9878608.txt', '0030012461': 'training/10233875.txt', '003001157': 'training/1541828.txt', '003003229': 'evaluation/10092825.txt', '003001627': 'training/8739563.txt', '0030012465': 'training/10221658.txt'} REAL_KEYS['opinion'] = {'001001699': 'training/20.54.07-4914.txt', '001001692': 'training/09.35.06-27851.txt', '001001693': 'training/21.15.53-1323.txt', '001001690': 'training/21.13.43-28244.txt', '001001691': 'training/Article247_400.txt', '001001696': 'training/03.20.33-11983.txt', '001001697': 'training/21.56.20-863.txt', '001001420': 'training/20.40.44-4958.txt', '00100313': 'evaluation/21.10.31-12974.txt', '00100312': 'evaluation/11.14.43-703.txt', '00100311': 'evaluation/08.54.29-27700.txt', '00100310': 'evaluation/20.36.01-11616.txt', '00100317': 'evaluation/22.09.32-24118.txt', '00100316': 'evaluation/21.07.05-24942.txt', '00100315': 'evaluation/20.41.16-27812.txt', '00100314': 'evaluation/21.09.29-14261.txt', '00100318': 'evaluation/117CWL009.txt', '001001591': 'training/20.37.28-19761.txt', '001002159': 'development/20.37.23-17648.txt', '001001592': 'training/08.46.28-13637.txt', '001001597': 'training/20.24.49-8480.txt', '001002151': 'development/wsj_0168.txt', '001002153': 'development/09.18.38-22306.txt', '001002154': 'development/20.31.55-8089.txt', '001002155': 'development/06.21.24-4640.txt', '001002156': 'development/20.55.32-6296.txt', '001002157': 'development/21.34.10-25509.txt', '001001600': 'training/21.19.21-15525.txt', '001001603': 'training/22.34.58-23977.txt', '001001602': 'training/22.36.22-2937.txt', '001001212': 'training/wsj_0150.txt', '001001213': 'training/21.03.47-22424.txt', '001001607': 'training/20.52.48-16582.txt', '001001606': 'training/12.21.28-26118.txt', '001001357': 'training/22.17.52-18926.txt', '001001608': 'training/15.45.31-12608.txt', '001001353': 'training/20.41.17-4020.txt', '001001352': 'training/23.12.52-21900.txt', '001001218': 'training/06.44.52-19992.txt', '001001829': 'training/19.59.31-21964.txt', '001001828': 'training/23.46.20-17835.txt', '001001588': 'training/21.47.41-28228.txt', '001001825': 'training/06.56.31-3120.txt', '001001140': 'training/21.35.14-9737.txt', '001001826': 'training/21.59.17-14271.txt', '001001147': 'training/20.42.47-22260.txt', '001003236': 'evaluation/wsj_0991.txt', '001003234': 'evaluation/20.37.48-18053.txt', '001003232': 'evaluation/AFGP-2002-600002-Trans.txt', '001003233': 'evaluation/21.14.53-15562.txt', '001003230': 'evaluation/20.32.26-14925.txt', '001003231': 'evaluation/20.35.17-19238.txt', '001001509': 'training/00.31.31-4544.txt', '001001859': 'training/20.48.58-26376.txt', '001001502': 'training/20.27.25-21759.txt', '001001856': 'training/23.01.07-15764.txt', '001001501': 'training/21.47.55-9546.txt', '001001506': 'training/12.08.27-27397.txt', '001001504': 'training/wsj_0173.txt', '001001505': 'training/20.41.29-25820.txt', '001003199': 'evaluation/00.16.28-8800.txt', '001003198': 'evaluation/wsj_0332.txt', '001003195': 'evaluation/11.08.55-4179.txt', '001003194': 'evaluation/sw2025-ms98-a-trans.ascii-1-NEW.txt', '001003191': 'evaluation/20.33.16-3417.txt', '001001132': 'training/22.37.46-10374.txt', '001001136': 'training/01.03.02-16670.txt', '001001139': 'training/07.29.33-28852.txt', '001001546': 'training/21.16.02-13111.txt', '001001547': 'training/20.55.44-16289.txt', '001003120': 'evaluation/20.46.06-8042.txt', '001003288': 'evaluation/118CWL048.txt', '001003124': 'evaluation/23.47.18-27024.txt', '001003125': 'evaluation/20.50.06-12228.txt', '001003283': 'evaluation/13.24.42-23228.txt', '001001542': 'training/20.45.48-7720.txt', '001003281': 'evaluation/21.50.13-28912.txt', '001003280': 'evaluation/21.54.31-15463.txt', '001003285': 'evaluation/22.00.23-1236.txt', '001003284': 'evaluation/23.18.15-25073.txt', '001001540': 'training/wsj_0973.txt', '001001438': 'training/wsj_1042.txt', '001001439': 'training/15.59.08-16874.txt', '001001432': 'training/21.04.46-9278.txt', '001001433': 'training/20.57.02-23111.txt', '001001430': 'training/06.28.56-23638.txt', '00100220': 'development/21.04.01-4695.txt', '001001598': 'training/21.05.30-9608.txt', '001001549': 'training/wsj_0695.txt', '001001799': 'training/wsj_0135.txt', '00100358': 'evaluation/20.36.05-804.txt', '00100357': 'evaluation/11.11.15-6639.txt', '00100356': 'evaluation/118CWL049.txt', '00100355': 'evaluation/15.16.19-1897.txt', '00100354': 'evaluation/20.44.41-9757.txt', '001001793': 'training/20.58.47-19000.txt', '00100216': 'development/21.31.56-18015.txt', '00100214': 'development/20.26.01-7285.txt', '00100215': 'development/17.58.35-21375.txt', '00100198': 'training/20.46.37-70.txt', '00100371': 'evaluation/wsj_0570.txt', '00100196': 'training/wsj_0805.txt', '00100191': 'training/21.28.50-13504.txt', '00100229': 'development/21.11.57-16690.txt', '00100193': 'training/06.25.38-22700.txt', '00100192': 'training/20.50.33-2917.txt', '001001319': 'training/wsj_0321.txt', '001001318': 'training/15.26.56-25086.txt', '001002110': 'development/SNO-525.txt', '001002111': 'development/22.26.42-7148.txt', '001001645': 'training/11.57.16-2305.txt', '001001311': 'training/03.19.43-8352.txt', '001001646': 'training/04.22.14-2532.txt', '001001317': 'training/21.42.46-13715.txt', '001001316': 'training/20.20.54-21851.txt', '001001314': 'training/20.28.17-28408.txt', '001001723': 'training/20.55.29-11159.txt', '001001489': 'training/08.35.40-23372.txt', '001001724': 'training/20.35.03-16511.txt', '001001725': 'training/114CUL059.txt', '001001483': 'training/03.47.06-11142.txt', '001001605': 'training/wsj_1033.txt', '001001481': 'training/wsj_0667.txt', '001001729': 'training/04.28.09-24241.txt', '001001486': 'training/21.12.57-28994.txt', '001001865': 'training/06.39.11-29583.txt', '001001864': 'training/wsj_0161.txt', '001001860': 'training/06.11.16-17420.txt', '00100379': 'evaluation/20.37.28-1540.txt', '001001869': 'training/21.24.12-20558.txt', '001001609': 'training/wsj_0583.txt', '00100284': 'development/12.26.48-19550.txt', '00100287': 'development/23.54.13-6261.txt', '001001894': 'training/20.53.55-17301.txt', '001001896': 'training/11.35.22-9439.txt', '001001897': 'training/20.55.56-1089.txt', '001002161': 'development/21.11.08-11611.txt', '001002160': 'development/112C-L015.txt', '001001544': 'training/16.03.54-17435.txt', '001002165': 'development/20.22.38-4806.txt', '001001543': 'training/wsj_0650.txt', '001002167': 'development/21.57.16-1319.txt', '001001268': 'training/20.42.08-18791.txt', '001002169': 'development/Article247_3500.txt', '001001266': 'training/20.41.29-15150.txt', '001001265': 'training/11.18.53-27931.txt', '001001263': 'training/21.31.10-15710.txt', '001001261': 'training/wsj_0157.txt', '001001260': 'training/20.53.38-28377.txt', '001003168': 'evaluation/21.25.31-10464.txt', '001003169': 'evaluation/11.21.37-22256.txt', '001003167': 'evaluation/wsj_1038.txt', '001003165': 'evaluation/21.33.44-18068.txt', '001003162': 'evaluation/20.48.23-2528.txt', '001001324': 'training/21.49.24-6596.txt', '001003247': 'evaluation/08.47.00-17401.txt', '001003246': 'evaluation/20.59.27-22386.txt', '001003245': 'evaluation/116CUL034.txt', '001003244': 'evaluation/20.48.00-11907.txt', '001003243': 'evaluation/112C-L013.txt', '001003242': 'evaluation/wsj_0713.txt', '001003241': 'evaluation/20.31.54-28680.txt', '001001322': 'training/20.43.49-8525.txt', '001003248': 'evaluation/IZ-060316-01-Trans-1.txt', '001001663': 'training/17.55.53-20579.txt', '001001321': 'training/10.24.29-21670.txt', '00100121': 'training/20.45.56-235.txt', '001001168': 'training/20.28.43-12622.txt', '001001169': 'training/HistoryJerusalem.txt', '001001163': 'training/117CWL008.txt', '001001165': 'training/11.08.41-17418.txt', '001001167': 'training/21.18.24-28147.txt', '001003119': 'evaluation/wsj_0904.txt', '001003118': 'evaluation/22.23.02-12197.txt', '00100159': 'training/wsj_0316.txt', '00100253': 'development/12.03.26-29353.txt', '00100250': 'development/15.08.17-13301.txt', '00100251': 'development/21.00.37-23670.txt', '00100256': 'development/21.31.07-25924.txt', '00100257': 'development/20.31.05-16359.txt', '00100254': 'development/113CWL018.txt', '00100151': 'training/20.31.39-22620.txt', '00100258': 'development/wsj_0661.txt', '001001296': 'training/110CYL067.txt', '001001293': 'training/15.01.39-17844.txt', '001001290': 'training/wsj_0073.txt', '00100397': 'evaluation/wsj_0176.txt', '001001768': 'training/20.34.52-3092.txt', '001001764': 'training/20.37.54-21168.txt', '001001762': 'training/21.22.09-18839.txt', '001001763': 'training/21.31.14-23484.txt', '001001194': 'training/119CWL041.txt', '001001197': 'training/20.18.05-14490.txt', '001001196': 'training/110CYL068.txt', '0010028': 'development/21.37.33-17834.txt', '0010029': 'development/20.59.38-27990.txt', '001001193': 'training/20.36.11-21342.txt', '001001192': 'training/14.12.17-21564.txt', '0010025': 'development/21.03.13-21992.txt', '0010027': 'development/20.41.31-29293.txt', '001001198': 'training/wsj_0032.txt', '0010022': 'development/20.33.31-29984.txt', '001001448': 'training/21.18.25-12166.txt', '001001458': 'training/20.38.59-25700.txt', '00100325': 'evaluation/20.32.39-1919.txt', '00100323': 'evaluation/21.45.53-9610.txt', '00100328': 'evaluation/10.12.58-29108.txt', '00100329': 'evaluation/wsj_0324.txt', '001001583': 'training/wsj_0122.txt', '001001228': 'training/10.35.30-7542.txt', '001002128': 'development/22.41.16-14236.txt', '001001584': 'training/21.01.18-7143.txt', '001002127': 'development/sw2015-ms98-a-trans.txt', '001002126': 'development/06.35.13-26224.txt', '001002123': 'development/20.53.10-28693.txt', '001002122': 'development/20.43.16-10786.txt', '001001612': 'training/22.32.10-2528.txt', '001001342': 'training/06.20.40-8369.txt', '001001343': 'training/04.08.51-29183.txt', '001001345': 'training/21.52.21-20248.txt', '001001347': 'training/12.55.04-23296.txt', '001001348': 'training/21.05.45-10422.txt', '001001349': 'training/06.12.31-26764.txt', '001001619': 'training/20.42.42-156.txt', '001001838': 'training/21.13.16-13370.txt', '001001830': 'training/114CUL058.txt', '001001837': 'training/20.13.06-23605.txt', '001001835': 'training/21.16.57-11242.txt', '001003202': 'evaluation/wsj_0568.txt', '001003200': 'evaluation/wsj_0175.txt', '001003207': 'evaluation/20.43.10-5176.txt', '001003205': 'evaluation/wsj_0340.txt', '001001539': 'training/03.20.43-12807.txt', '001001538': 'training/21.45.36-19604.txt', '001001534': 'training/21.05.07-9115.txt', '001001532': 'training/12.03.37-20669.txt', '001003181': 'evaluation/20.36.59-7133.txt', '001003182': 'evaluation/11.41.36-1407.txt', '001003184': 'evaluation/23.54.01-6168.txt', '001003185': 'evaluation/21.13.42-3998.txt', '001003187': 'evaluation/wsj_0928.txt', '001003188': 'evaluation/07.05.30-9348.txt', '001001125': 'training/09.11.12-16761.txt', '001001122': 'training/21.26.05-29959.txt', '001001128': 'training/20.53.10-15177.txt', '001001129': 'training/wsj_0950.txt', '001001156': 'training/wsj_0685.txt', '001003116': 'evaluation/21.31.45-20536.txt', '001003112': 'evaluation/20.43.50-5456.txt', '001003111': 'evaluation/12.09.12-23643.txt', '0010016': 'training/wsj_0778.txt', '001003291': 'evaluation/22.07.25-26605.txt', '001003292': 'evaluation/08.07.09-11180.txt', '001003293': 'evaluation/wsj_1039.txt', '00100114': 'training/15.04.33-20423.txt', '00100116': 'training/AFGP-2002-600045-Trans.txt', '00100111': 'training/21.19.03-816.txt', '00100110': 'training/wsj_0768.txt', '00100119': 'training/08.22.04-29634.txt', '00100118': 'training/wsj_0541.txt', '001001393': 'training/20.23.00-14747.txt', '001001392': 'training/15.37.27-28157.txt', '001001391': 'training/01.09.50-14694.txt', '001001390': 'training/20.26.14-23928.txt', '001001396': 'training/21.16.15-1122.txt', '001001394': 'training/wsj_0815.txt', '001001399': 'training/21.53.09-11428.txt', '001001398': 'training/20.32.22-8496.txt', '001001403': 'training/wsj_0152.txt', '001001402': 'training/15.05.17-7881.txt', '001001406': 'training/21.26.55-3999.txt', '001001405': 'training/12.46.57-22041.txt', '001001404': 'training/21.30.32-24917.txt', '001001159': 'training/15.12.22-6729.txt', '001001158': 'training/11.52.35-10118.txt', '001001408': 'training/wsj_0557.txt', '001001885': 'training/21.37.20-19607.txt', '001001259': 'training/RindnerBonnie.txt', '001001233': 'training/21.17.18-25137.txt', '001001551': 'training/21.00.59-25256.txt', '001001780': 'training/21.47.26-20990.txt', '001001782': 'training/20.34.33-18786.txt', '001001783': 'training/22.11.01-7259.txt', '00100367': 'evaluation/21.00.30-21204.txt', '001001786': 'training/15.20.41-10497.txt', '001001787': 'training/01.48.48-11084.txt', '00100368': 'evaluation/20.55.24-19278.txt', '00100163': 'training/21.05.41-14545.txt', '00100165': 'training/20.44.27-28756.txt', '00100166': 'training/20.46.58-22510.txt', '00100167': 'training/08.39.09-12713.txt', '001001308': 'training/wsj_0534.txt', '001001309': 'training/14.06.39-26143.txt', '001001659': 'training/15.29.53-18099.txt', '001001656': 'training/ch5.txt', '001001655': 'training/21.21.46-15196.txt', '001001300': 'training/21.09.25-22686.txt', '001001301': 'training/CNN_AARONBROWN_ENG_20051101_215800.partial-NEW.txt', '001001650': 'training/wsj_0923.txt', '001001303': 'training/22.24.21-25298.txt', '001001716': 'training/12.38.27-3333.txt', '001001498': 'training/04.51.05-27505.txt', '001001712': 'training/13.44.36-19236.txt', '001001711': 'training/wsj_0167.txt', '001001710': 'training/21.40.05-5224.txt', '001001495': 'training/21.29.47-14352.txt', '001001497': 'training/wsj_0751.txt', '001001718': 'training/03.23.00-19213.txt', '001001875': 'training/08.27.54-12647.txt', '001001872': 'training/21.25.32-10485.txt', '001001873': 'training/21.01.04-6923.txt', '00100370': 'evaluation/08.01.38-4843.txt', '001001878': 'training/21.17.08-16542.txt', '001001571': 'training/20.27.21-24397.txt', '001001570': 'training/08.10.09-13801.txt', '001001575': 'training/wsj_1035.txt', '001001574': 'training/22.37.03-25968.txt', '001002172': 'development/08.05.55-10723.txt', '001002173': 'development/20.59.39-4666.txt', '001002170': 'development/21.24.00-10191.txt', '001001273': 'training/21.37.57-3837.txt', '001001274': 'training/12.44.14-20223.txt', '001001276': 'training/21.44.41-4066.txt', '001001277': 'training/12.14.07-203.txt', '001001278': 'training/21.20.02-17431.txt', '001001373': 'training/wsj_0292.txt', '001002179': 'development/20.23.54-19638.txt', '001001375': 'training/20.20.40-16093.txt', '001001374': 'training/20.32.32-8613.txt', '001001377': 'training/wsj_0806.txt', '00100142': 'training/20.58.08-21315.txt', '001003152': 'evaluation/wsj_2465.txt', '001003157': 'evaluation/21.23.57-11596.txt', '001003155': 'evaluation/20.25.51-11532.txt', '001003154': 'evaluation/21.05.06-7601.txt', '001003251': 'evaluation/20.48.55-4975.txt', '001003252': 'evaluation/06.47.23-22498.txt', '001003158': 'evaluation/21.11.01-23492.txt', '001003254': 'evaluation/20.41.37-18755.txt', '00100385': 'evaluation/21.21.31-27111.txt', '001003256': 'evaluation/20.37.46-24515.txt', '00100387': 'evaluation/20.45.06-11781.txt', '001002137': 'development/ENRON-pearson-email-25jul02.txt', '001002134': 'development/21.05.18-8344.txt', '00100158': 'training/20.35.25-594.txt', '00100393': 'evaluation/20.57.35-19171.txt', '001001119': 'training/21.53.09-20885.txt', '001001118': 'training/20.55.06-13183.txt', '001001115': 'training/00.42.05-29788.txt', '001001117': 'training/wsj_1640.mrg-NEW.txt', '001001116': 'training/21.22.54-17837.txt', '001001110': 'training/wsj_0555.txt', '001003289': 'evaluation/wsj_0762.txt', '001001331': 'training/wsj_0610.txt', '00100244': 'development/12.19.35-18077.txt', '00100246': 'development/20.59.09-4817.txt', '00100128': 'training/20.36.23-26588.txt', '00100243': 'development/21.03.06-12522.txt', '00100242': 'development/22.11.18-10696.txt', '00100124': 'training/115CVL037.txt', '00100249': 'development/wsj_0151.txt', '00100248': 'development/16.15.23-10154.txt', '001003128': 'evaluation/08.36.15-7509.txt', '001001753': 'training/21.00.52-4167.txt', '001001459': 'training/20.53.19-28892.txt', '001001751': 'training/20.19.13-15223.txt', '001001450': 'training/21.33.09-24778.txt', '001001451': 'training/wsj_0136.txt', '001001759': 'training/21.20.14-9569.txt', '001001453': 'training/14.06.40-17312.txt', '001001454': 'training/21.45.03-5180.txt', '001001455': 'training/21.24.50-2535.txt', '001001186': 'training/118CWL050.txt', '001001187': 'training/02.01.27-21386.txt', '001001181': 'training/08.54.02-18235.txt', '0010010': 'training/21.16.03-15717.txt', '0010012': 'training/20.52.55-19163.txt', '0010014': 'training/20.46.39-9348.txt', '0010017': 'training/16.23.04-4326.txt', '001001189': 'training/11.49.47-8044.txt', '001001652': 'training/20000410_nyt-NEW.txt', '00100332': 'evaluation/A1.E1-NEW.txt', '00100335': 'evaluation/22.24.21-23558.txt', '00100336': 'evaluation/Article247_328.txt', '00100230': 'development/110CYL200.txt', '001003220': 'evaluation/20.53.05-28623.txt', '00100236': 'development/21.09.21-9865.txt', '00100237': 'development/15.11.50-23748.txt', '00100239': 'development/wsj_0325.txt', '001001651': 'training/21.10.42-27754.txt', '001001239': 'training/09.53.15-23595.txt', '001002139': 'development/20.45.06-5529.txt', '001001234': 'training/20.43.33-11456.txt', '001001235': 'training/20.20.10-3414.txt', '001001236': 'training/08.21.04-13527.txt', '001002135': 'development/12.04.49-21742.txt', '001001231': 'training/04.24.14-26782.txt', '001002130': 'development/wsj_0171.txt', '001002131': 'development/12.22.09-1493.txt', '001001337': 'training/13.21.23-8227.txt', '001001336': 'training/20.27.05-27044.txt', '001001623': 'training/21.44.44-3823.txt', '001001330': 'training/03.17.23-6711.txt', '001001333': 'training/20.43.13-27926.txt', '001001332': 'training/20.45.17-5753.txt', '001001629': 'training/08.03.23-18607.txt', '001001628': 'training/21.12.51-10332.txt', '001001809': 'training/16.03.28-26714.txt', '001001808': 'training/01.55.54-27027.txt', '001001807': 'training/20.23.33-17594.txt', '001001806': 'training/05.20.33-11163.txt', '001001802': 'training/ReidSandra.txt', '001001801': 'training/wsj_0760.txt', '00100398': 'evaluation/12.07.32-18094.txt', '00100221': 'development/sw2071-UTF16-ms98-a-trans.txt', '001003215': 'evaluation/20.50.30-97.txt', '001003216': 'evaluation/112C-L016.txt', '001003217': 'evaluation/18.21.31-1735.txt', '001003210': 'evaluation/20.28.15-21486.txt', '001003213': 'evaluation/12.36.18-4189.txt', '001003143': 'evaluation/21.15.10-2135.txt', '001003219': 'evaluation/15.13.14-22330.txt', '001001520': 'training/21.53.17-27187.txt', '001001521': 'training/08.06.09-13335.txt', '001001525': 'training/21.33.22-7140.txt', '001001528': 'training/20.38.49-16233.txt', '001001240': 'training/wsj_1073.txt', '001001242': 'training/15.13.45-21190.txt', '001001244': 'training/22.04.48-17941.txt', '00100390': 'evaluation/20.37.52-21155.txt', '001003148': 'evaluation/15.36.00-16525.txt', '001003149': 'evaluation/20.21.54-4382.txt', '001003222': 'evaluation/22.11.06-28210.txt', '001001555': 'training/20.47.22-10067.txt', '001001518': 'training/21.25.24-12160.txt', '001003101': 'evaluation/20.57.00-17276.txt', '001001653': 'training/wsj_0266.txt', '001001445': 'training/20.58.35-24209.txt', '001003106': 'evaluation/20.24.37-12857.txt', '001003107': 'evaluation/21.28.14-6829.txt', '001003263': 'evaluation/20.49.56-27748.txt', '001001444': 'training/115CVL035.txt', '001003267': 'evaluation/20.26.52-10078.txt', '001003266': 'evaluation/06.10.04-18139.txt', '001001554': 'training/wsj_0527.txt', '001001190': 'training/20000416_xin_eng-NEW.txt', '001001384': 'training/22.34.49-13286.txt', '001001382': 'training/114CUL060.txt', '001001383': 'training/11.38.42-28823.txt', '001001388': 'training/21.00.49-23712.txt', '001001389': 'training/wsj_0558.txt', '001001142': 'training/01.24.22-5235.txt', '001001143': 'training/20.48.51-14201.txt', '001001416': 'training/08.12.40-1611.txt', '001001146': 'training/11.58.59-16532.txt', '001001411': 'training/22.08.22-24562.txt', '001001144': 'training/21.01.08-20603.txt', '001001413': 'training/20.58.51-26741.txt', '001001515': 'training/00.39.51-8366.txt', '001001148': 'training/20000424_nyt-NEW.txt', '001001418': 'training/20.47.35-12690.txt', '001001419': 'training/wsj_0144.txt', '001001449': 'training/12.45.44-23455.txt', '001001325': 'training/22.02.19-8239.txt', '00100375': 'evaluation/22.34.35-3190.txt', '00100377': 'evaluation/wsj_0006.txt', '00100278': 'development/21.07.32-24343.txt', '00100279': 'development/21.15.18-84.txt', '00100373': 'evaluation/20.31.28-6696.txt', '00100372': 'evaluation/23.34.07-17177.txt', '00100274': 'development/22.25.44-26373.txt', '00100275': 'development/08.40.56-18707.txt', '00100276': 'development/21.19.18-13373.txt', '00100277': 'development/18.26.05-8627.txt', '00100270': 'development/20.42.17-18974.txt', '00100271': 'development/13.05.15-16517.txt', '00100273': 'development/20.38.10-27533.txt', '00100172': 'training/20.42.07-7434.txt', '00100171': 'training/12.04.00-12904.txt', '00100170': 'training/wsj_0542.txt', '00100177': 'training/08.07.48-9357.txt', '00100176': 'training/23.03.25-11609.txt', '00100175': 'training/115CVL036.txt', '00100174': 'training/21.15.19-21938.txt', '00100179': 'training/wsj_0709.txt', '00100178': 'training/110CYL072.txt', '001001708': 'training/110CYL071.txt', '001001662': 'training/20.59.14-2538.txt', '001001661': 'training/20.40.49-25157.txt', '001001667': 'training/20.56.08-8888.txt', '001001665': 'training/12.15.47-5091.txt', '001001701': 'training/20.47.21-9712.txt', '001001704': 'training/00.59.03-19180.txt', '001001461': 'training/20.50.32-16604.txt', '001001463': 'training/21.29.08-21533.txt', '001001464': 'training/00.42.54-29681.txt', '001001467': 'training/21.38.12-11637.txt', '001001466': 'training/wsj_0816.txt', '001001469': 'training/20.32.14-18152.txt', '001001689': 'training/21.39.02-16166.txt', '001001688': 'training/05.22.13-11526.txt', '001001681': 'training/wsj_0927.txt', '001001680': 'training/22.21.24-5526.txt', '001001683': 'training/20.22.42-14586.txt', '001001687': 'training/21.16.59-4123.txt', '001001686': 'training/21.37.46-9337.txt', '001001553': 'training/21.45.32-26215.txt', '001001529': 'training/16.01.44-19040.txt', '001001258': 'training/112C-L014.txt', '001001569': 'training/20.47.43-22487.txt', '001001564': 'training/20.27.35-29256.txt', '001001566': 'training/21.10.24-16924.txt', '001001561': 'training/enron-thread-159550.txt', '001001562': 'training/21.16.53-1351.txt', '001001563': 'training/21.09.23-6361.txt', '001001205': 'training/20.40.44-29118.txt', '001002146': 'development/20.56.01-12278.txt', '001001369': 'training/21.00.52-4729.txt', '001002143': 'development/07.16.31-13271.txt', '001002142': 'development/chapter-10.txt', '001002141': 'development/110CYL070.txt', '001001363': 'training/wsj_0127.txt', '001001361': 'training/20.42.04-19290.txt', '001001366': 'training/15.36.10-18917.txt', '001002149': 'development/wsj_0356.txt', '001001351': 'training/wsj_0165.txt', '00100399': 'evaluation/sw2014-UTF16-ms98-a-trans.txt', '001003147': 'evaluation/12.16.50-28640.txt', '001003140': 'evaluation/21.18.30-8033.txt', '001003228': 'evaluation/12.19.24-26050.txt', '001001271': 'training/wsj_0187.txt', '001003225': 'evaluation/11.04.20-23621.txt', '001001798': 'training/20.41.07-12494.txt', '001003221': 'evaluation/22.23.51-4342.txt', '001001272': 'training/21.31.55-5725.txt', '00100395': 'evaluation/11.30.45-27115.txt', '00100394': 'evaluation/22.28.56-2635.txt', '001001797': 'training/20.58.58-18302.txt', '001001849': 'training/112C-L012.txt', '001001842': 'training/20.27.12-885.txt', '001001840': 'training/wsj_0907.txt', '001001847': 'training/20.21.49-25548.txt', '001001795': 'training/06.29.55-10258.txt', '001001845': 'training/20.33.06-778.txt', '001001844': 'training/06.29.16-13721.txt', '001001794': 'training/13.10.41-18948.txt', '001001371': 'training/20.29.29-13302.txt', '001001108': 'training/20.51.31-14776.txt', '001001109': 'training/08.11.35-9355.txt', '001001106': 'training/21.03.19-22060.txt', '001001107': 'training/20.56.28-3485.txt', '001001104': 'training/HistoryGreek.txt', '001001105': 'training/11.09.24-151.txt', '001001103': 'training/20.42.26-19148.txt', '001001100': 'training/20.49.26-27556.txt', '001001101': 'training/20000420_xin_eng-NEW.txt', '001003132': 'evaluation/18.02.46-24802.txt', '001003138': 'evaluation/110CYL069.txt', '001001150': 'training/21.20.15-24907.txt', '001001589': 'training/21.28.11-375.txt', '00100139': 'training/16.01.33-12919.txt', '00100138': 'training/20.22.56-6451.txt', '00100137': 'training/wsj_0068.txt', '00100134': 'training/18.09.35-13708.txt', '00100133': 'training/12.30.38-25095.txt', '001001442': 'training/20.49.23-18398.txt', '00100130': 'training/15.02.54-18922.txt', '001001225': 'training/21.00.11-67.txt', '001001747': 'training/wsj_0924.txt', '001001429': 'training/Article247_66.txt', '001001426': 'training/21.35.03-11178.txt', '001001421': 'training/21.50.57-15245.txt', '001001610': 'training/21.01.07-18921.txt', '001001423': 'training/NapierDianne.txt', '001001422': 'training/01.37.46-4752.txt', '00100219': 'development/20.56.51-26264.txt', '00100264': 'development/20.45.31-22188.txt', '00100345': 'evaluation/14.05.39-25211.txt', '00100340': 'evaluation/21.58.34-17613.txt', '00100343': 'evaluation/20.58.30-19320.txt', '00100222': 'development/23.37.40-9205.txt', '00100188': 'training/wsj_0551.txt', '00100189': 'training/06.48.28-6852.txt', '00100226': 'development/20.41.01-8736.txt', '00100180': 'training/20.42.01-25605.txt', '00100181': 'training/22.27.34-26526.txt', '00100185': 'training/20.33.21-17578.txt', '001001648': 'training/11.05.55-12013.txt', '001002103': 'development/21.20.46-20946.txt', '001001639': 'training/20.58.06-10542.txt', '001002101': 'development/20.38.16-13557.txt', '001001630': 'training/20.23.04-14788.txt', '001001631': 'training/12.22.23-26451.txt', '001001632': 'training/20.45.09-11809.txt', '001001634': 'training/23.25.24-13030.txt', '001001635': 'training/20.04.50-29091.txt', '001001636': 'training/21.27.48-10130.txt', '001001730': 'training/00.48.42-17806.txt', '001001733': 'training/15.01.41-17868.txt', '001001732': 'training/wsj_0026.txt', '001001735': 'training/20.34.02-2831.txt', '001001320': 'training/20.49.12-24038.txt', '001001736': 'training/20.52.31-12963.txt', '001001738': 'training/Article247_500.txt', '001001328': 'training/wsj_0376.txt', '001001329': 'training/23.17.57-23406.txt', '001001811': 'training/17.41.39-5995.txt', '001001812': 'training/17.55.28-22100.txt', '001001814': 'training/13.40.05-15087.txt', '001001815': 'training/06.51.18-1222.txt', '001001816': 'training/21.21.45-6259.txt', '001001819': 'training/21.35.18-3709.txt', '001001640': 'training/05.19.38-789.txt', '001001257': 'training/12.04.40-21590.txt', '00100298': 'development/10.03.26-15373.txt', '00100299': 'development/04.33.07-17094.txt', '00100294': 'development/20.23.00-9795.txt', '00100293': 'development/wsj_0706.txt', '00100290': 'development/23.12.58-1993.txt', '00100291': 'development/20.40.04-14266.txt', '001001887': 'training/17.55.10-20068.txt', '001001886': 'training/PolkMaria.txt', '001001557': 'training/20.42.51-22299.txt', '001001556': 'training/Article247_327.txt', '001001883': 'training/21.30.26-14869.txt', '00100148': 'training/CNN_ENG_20030614_173123.4-NEW-1.txt', '001001881': 'training/wsj_0660.txt', '001001558': 'training/23.55.20-21157.txt', '001001254': 'training/23.26.43-14352.txt', '001001888': 'training/08.15.19-23507.txt', '00100389': 'evaluation/12.33.55-762.txt', '001001482': 'training/114CUL057.txt', '001001671': 'training/20.31.22-12363.txt', '001001255': 'training/21.19.00-22108.txt', '001003178': 'evaluation/21.03.16-25474.txt', '001003177': 'evaluation/08.09.29-13319.txt', '001003176': 'evaluation/20.46.47-22286.txt', '001003171': 'evaluation/07.20.20-11694.txt', '001003170': 'evaluation/113CWL017.txt', '001003173': 'evaluation/07.52.36-18982.txt', '001003270': 'evaluation/sw2078-UTF16-ms98-a-trans.txt', '001003271': 'evaluation/20.59.38-14856.txt', '001003276': 'evaluation/21.03.10-21966.txt', '001003159': 'evaluation/20.23.24-6873.txt', '001003259': 'evaluation/20.36.20-26562.txt', '00100386': 'evaluation/21.45.56-26903.txt', '001001177': 'training/11.14.49-23456.txt', '001001176': 'training/22.23.24-9583.txt', '001001175': 'training/20000415_apw_eng-NEW.txt', '001001171': 'training/20.23.27-26526.txt', '001001170': 'training/A1.E2-NEW.txt', '001001178': 'training/22.36.40-5626.txt', '00100268': 'development/21.16.16-1134.txt', '001002144': 'development/20.01.03-24107.txt', '00100263': 'development/20.52.23-14674.txt', '00100262': 'development/13.08.06-1812.txt', '00100260': 'development/12.22.33-10622.txt', '00100144': 'training/602CZL285-1.txt', '001001289': 'training/20.41.01-29272.txt', '00100143': 'training/wsj_0160.txt', '001001285': 'training/20.54.40-10484.txt', '001001740': 'training/20.43.51-9328.txt', '001001287': 'training/wsj_0736.txt', '001001281': 'training/wsj_0811.txt', '001001280': 'training/23.39.56-18704.txt', '001001283': 'training/08.03.05-5625.txt', '001001282': 'training/wsj_1040.txt', '001001779': 'training/21.07.24-24231.txt', '001001778': 'training/21.07.24-28603.txt', '001001672': 'training/21.12.01-14811.txt', '001001775': 'training/20.33.43-387.txt', '001001774': 'training/wsj_0662.txt', '001001777': 'training/21.00.22-21144.txt', '001001776': 'training/AFGP-2002-602187-Trans.txt', '001001771': 'training/12.07.16-6586.txt', '001001471': 'training/20000419_apw_eng-NEW.txt', '001001476': 'training/21.04.32-17074.txt', '0010039': 'evaluation/21.17.41-25537.txt', '001001475': 'training/wsj_0189.txt', '0010037': 'evaluation/wsj_0679.txt', '0010036': 'evaluation/116CUL033.txt', '001001478': 'training/20.21.48-16389.txt', '001001479': 'training/08.14.55-18533.txt', '0010033': 'evaluation/21.24.32-9824.txt', '0010032': 'evaluation/21.04.31-28782.txt', '0010031': 'evaluation/21.50.32-3597.txt', '0010030': 'evaluation/20.45.53-22539.txt'} REAL_KEYS['negation'] = {'0020018': 'training/raw.txt', '0020028': 'development/raw.txt', '0020037': 'evaluation/raw.txt'} # UD Error is used when raising exceptions in this module class UDError(Exception): pass # Load given CoNLL-U file into internal representation def load_conllu(file): # Internal representation classes class UDRepresentation: def __init__(self): # Characters of all the tokens in the whole file. # Whitespace between tokens is not included. self.characters = [] # List of UDSpan instances with start&end indices into `characters`. self.tokens = [] # List of UDWord instances. self.words = [] # List of UDSpan instances with start&end indices into `characters`. self.sentences = [] class UDSpan: def __init__(self, start, end): self.start = start # Note that self.end marks the first position **after the end** of span, # so we can use characters[start:end] or range(start, end). self.end = end class UDWord: def __init__(self, span, columns, is_multiword): # Span of this word (or MWT, see below) within ud_representation.characters. self.span = span # 10 columns of the CoNLL-U file: ID, FORM, LEMMA,... self.columns = columns # is_multiword==True means that this word is part of a multi-word token. # In that case, self.span marks the span of the whole multi-word token. self.is_multiword = is_multiword # Reference to the UDWord instance representing the HEAD (or None if root). self.parent = None # List of references to UDWord instances representing functional-deprel children. self.functional_children = [] # Only consider universal FEATS. self.columns[FEATS] = "|".join(sorted(feat for feat in columns[FEATS].split("|") if feat.split("=", 1)[0] in UNIVERSAL_FEATURES)) # Let's ignore language-specific deprel subtypes. self.columns[DEPREL] = columns[DEPREL].split(":")[0] # Precompute which deprels are CONTENT_DEPRELS and which FUNCTIONAL_DEPRELS self.is_content_deprel = self.columns[DEPREL] in CONTENT_DEPRELS self.is_functional_deprel = self.columns[DEPREL] in FUNCTIONAL_DEPRELS ud = UDRepresentation() # Load the CoNLL-U file index, sentence_start = 0, None while True: line = file.readline() if not line: break line = line.rstrip("\r\n") # Handle sentence start boundaries if sentence_start is None: # Skip comments if line.startswith("#"): continue # Start a new sentence ud.sentences.append(UDSpan(index, 0)) sentence_start = len(ud.words) if not line: # Add parent and children UDWord links and check there are no cycles def process_word(word): if word.parent == "remapping": raise UDError("There is a cycle in a sentence") if word.parent is None: head = int(word.columns[HEAD]) if head < 0 or head > len(ud.words) - sentence_start: raise UDError("HEAD '{}' points outside of the sentence".format(word.columns[HEAD])) if head: parent = ud.words[sentence_start + head - 1] word.parent = "remapping" process_word(parent) word.parent = parent for word in ud.words[sentence_start:]: process_word(word) # func_children cannot be assigned within process_word # because it is called recursively and may result in adding one child twice. for word in ud.words[sentence_start:]: if word.parent and word.is_functional_deprel: word.parent.functional_children.append(word) # Check there is a single root node if len([word for word in ud.words[sentence_start:] if word.parent is None]) != 1: raise UDError("There are multiple roots in a sentence") # End the sentence ud.sentences[-1].end = index sentence_start = None continue # Read next token/word columns = line.split("\t") if len(columns) != 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(line)) # Skip empty nodes if "." in columns[ID]: continue # Delete spaces from FORM, so gold.characters == system.characters # even if one of them tokenizes the space. Use any Unicode character # with category Zs. if sys.version_info < (3, 0) and isinstance(line, str): columns[FORM] = columns[FORM].decode("utf-8") columns[FORM] = "".join(filter(lambda c: unicodedata.category(c) != "Zs", columns[FORM])) if sys.version_info < (3, 0) and isinstance(line, str): columns[FORM] = columns[FORM].encode("utf-8") if not columns[FORM]: raise UDError("There is an empty FORM in the CoNLL-U file") # Save token ud.characters.extend(columns[FORM]) ud.tokens.append(UDSpan(index, index + len(columns[FORM]))) index += len(columns[FORM]) # Handle multi-word tokens to save word(s) if "-" in columns[ID]: try: start, end = map(int, columns[ID].split("-")) except: raise UDError("Cannot parse multi-word token ID '{}'".format(columns[ID])) for _ in range(start, end + 1): word_line = file.readline().rstrip("\r\n") word_columns = word_line.split("\t") if len(word_columns) != 10: raise UDError("The CoNLL-U line does not contain 10 tab-separated columns: '{}'".format(word_line)) ud.words.append(UDWord(ud.tokens[-1], word_columns, is_multiword=True)) # Basic tokens/words else: try: word_id = int(columns[ID]) except: raise UDError("Cannot parse word ID '{}'".format(columns[ID])) if word_id != len(ud.words) - sentence_start + 1: raise UDError("Incorrect word ID '{}' for word '{}', expected '{}'".format(columns[ID], columns[FORM], len(ud.words) - sentence_start + 1)) try: head_id = int(columns[HEAD]) except: raise UDError("Cannot parse HEAD '{}'".format(columns[HEAD])) if head_id < 0: raise UDError("HEAD cannot be negative") ud.words.append(UDWord(ud.tokens[-1], columns, is_multiword=False)) if sentence_start is not None: raise UDError("The CoNLL-U file does not end with empty line") return ud # Evaluate the gold and system treebanks (loaded using load_conllu). def evaluate(gold_ud, system_ud): class Score: def __init__(self, gold_total, system_total, correct, aligned_total=None): self.correct = correct self.gold_total = gold_total self.system_total = system_total self.aligned_total = aligned_total self.precision = correct / system_total if system_total else 0.0 self.recall = correct / gold_total if gold_total else 0.0 self.f1 = 2 * correct / (system_total + gold_total) if system_total + gold_total else 0.0 self.aligned_accuracy = correct / aligned_total if aligned_total else aligned_total class AlignmentWord: def __init__(self, gold_word, system_word): self.gold_word = gold_word self.system_word = system_word class Alignment: def __init__(self, gold_words, system_words): self.gold_words = gold_words self.system_words = system_words self.matched_words = [] self.matched_words_map = {} def append_aligned_words(self, gold_word, system_word): self.matched_words.append(AlignmentWord(gold_word, system_word)) self.matched_words_map[system_word] = gold_word def lower(text): if sys.version_info < (3, 0) and isinstance(text, str): return text.decode("utf-8").lower() return text.lower() def spans_score(gold_spans, system_spans): correct, gi, si = 0, 0, 0 while gi < len(gold_spans) and si < len(system_spans): if system_spans[si].start < gold_spans[gi].start: si += 1 elif gold_spans[gi].start < system_spans[si].start: gi += 1 else: correct += gold_spans[gi].end == system_spans[si].end si += 1 gi += 1 return Score(len(gold_spans), len(system_spans), correct) def alignment_score(alignment, key_fn=None, filter_fn=None): if filter_fn is not None: gold = sum(1 for gold in alignment.gold_words if filter_fn(gold)) system = sum(1 for system in alignment.system_words if filter_fn(system)) aligned = sum(1 for word in alignment.matched_words if filter_fn(word.gold_word)) else: gold = len(alignment.gold_words) system = len(alignment.system_words) aligned = len(alignment.matched_words) if key_fn is None: # Return score for whole aligned words return Score(gold, system, aligned) def gold_aligned_gold(word): return word def gold_aligned_system(word): return alignment.matched_words_map.get(word, "NotAligned") if word is not None else None correct = 0 for words in alignment.matched_words: if filter_fn is None or filter_fn(words.gold_word): if key_fn(words.gold_word, gold_aligned_gold) == key_fn(words.system_word, gold_aligned_system): correct += 1 return Score(gold, system, correct, aligned) def beyond_end(words, i, multiword_span_end): if i >= len(words): return True if words[i].is_multiword: return words[i].span.start >= multiword_span_end return words[i].span.end > multiword_span_end def extend_end(word, multiword_span_end): if word.is_multiword and word.span.end > multiword_span_end: return word.span.end return multiword_span_end def find_multiword_span(gold_words, system_words, gi, si): # We know gold_words[gi].is_multiword or system_words[si].is_multiword. # Find the start of the multiword span (gs, ss), so the multiword span is minimal. # Initialize multiword_span_end characters index. if gold_words[gi].is_multiword: multiword_span_end = gold_words[gi].span.end if not system_words[si].is_multiword and system_words[si].span.start < gold_words[gi].span.start: si += 1 else: # if system_words[si].is_multiword multiword_span_end = system_words[si].span.end if not gold_words[gi].is_multiword and gold_words[gi].span.start < system_words[si].span.start: gi += 1 gs, ss = gi, si # Find the end of the multiword span # (so both gi and si are pointing to the word following the multiword span end). while not beyond_end(gold_words, gi, multiword_span_end) or \ not beyond_end(system_words, si, multiword_span_end): if gi < len(gold_words) and (si >= len(system_words) or gold_words[gi].span.start <= system_words[si].span.start): multiword_span_end = extend_end(gold_words[gi], multiword_span_end) gi += 1 else: multiword_span_end = extend_end(system_words[si], multiword_span_end) si += 1 return gs, ss, gi, si def compute_lcs(gold_words, system_words, gi, si, gs, ss): lcs = [[0] * (si - ss) for i in range(gi - gs)] for g in reversed(range(gi - gs)): for s in reversed(range(si - ss)): if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): lcs[g][s] = 1 + (lcs[g+1][s+1] if g+1 < gi-gs and s+1 < si-ss else 0) lcs[g][s] = max(lcs[g][s], lcs[g+1][s] if g+1 < gi-gs else 0) lcs[g][s] = max(lcs[g][s], lcs[g][s+1] if s+1 < si-ss else 0) return lcs def align_words(gold_words, system_words): alignment = Alignment(gold_words, system_words) gi, si = 0, 0 while gi < len(gold_words) and si < len(system_words): if gold_words[gi].is_multiword or system_words[si].is_multiword: # A: Multi-word tokens => align via LCS within the whole "multiword span". gs, ss, gi, si = find_multiword_span(gold_words, system_words, gi, si) if si > ss and gi > gs: lcs = compute_lcs(gold_words, system_words, gi, si, gs, ss) # Store aligned words s, g = 0, 0 while g < gi - gs and s < si - ss: if lower(gold_words[gs + g].columns[FORM]) == lower(system_words[ss + s].columns[FORM]): alignment.append_aligned_words(gold_words[gs+g], system_words[ss+s]) g += 1 s += 1 elif lcs[g][s] == (lcs[g+1][s] if g+1 < gi-gs else 0): g += 1 else: s += 1 else: # B: No multi-word token => align according to spans. if (gold_words[gi].span.start, gold_words[gi].span.end) == (system_words[si].span.start, system_words[si].span.end): alignment.append_aligned_words(gold_words[gi], system_words[si]) gi += 1 si += 1 elif gold_words[gi].span.start <= system_words[si].span.start: gi += 1 else: si += 1 return alignment # Check that the underlying character sequences do match. if gold_ud.characters != system_ud.characters: index = 0 while index < len(gold_ud.characters) and index < len(system_ud.characters) and \ gold_ud.characters[index] == system_ud.characters[index]: index += 1 raise UDError( "The concatenation of tokens in gold file and in system file differ!\n" + "First 20 differing characters in gold file: '{}' and system file: '{}'".format( "".join(gold_ud.characters[index:index + 20]), "".join(system_ud.characters[index:index + 20]) ) ) # Align words alignment = align_words(gold_ud.words, system_ud.words) # Compute the F1-scores return { "Tokens": spans_score(gold_ud.tokens, system_ud.tokens), "Sentences": spans_score(gold_ud.sentences, system_ud.sentences), "Words": alignment_score(alignment), "UPOS": alignment_score(alignment, lambda w, _: w.columns[UPOS]), "XPOS": alignment_score(alignment, lambda w, _: w.columns[XPOS]), "UFeats": alignment_score(alignment, lambda w, _: w.columns[FEATS]), "AllTags": alignment_score(alignment, lambda w, _: (w.columns[UPOS], w.columns[XPOS], w.columns[FEATS])), "Lemmas": alignment_score(alignment, lambda w, ga: w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), "UAS": alignment_score(alignment, lambda w, ga: ga(w.parent)), "LAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL])), "CLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL]), filter_fn=lambda w: w.is_content_deprel), "MLAS": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[UPOS], w.columns[FEATS], [(ga(c), c.columns[DEPREL], c.columns[UPOS], c.columns[FEATS]) for c in w.functional_children]), filter_fn=lambda w: w.is_content_deprel), "BLEX": alignment_score(alignment, lambda w, ga: (ga(w.parent), w.columns[DEPREL], w.columns[LEMMA] if ga(w).columns[LEMMA] != "_" else "_"), filter_fn=lambda w: w.is_content_deprel), } def load_conllu_file(path): _file = open(path, mode="r", **({"encoding": "utf-8"} if sys.version_info >= (3, 0) else {})) return load_conllu(_file) def evaluate_wrapper(args): # Load CoNLL-U files gold_ud = load_conllu_file(args.gold_file) system_ud = load_conllu_file(args.system_file) return evaluate(gold_ud, system_ud) def round_score(score): return round(score * 100. / 5.) * 5. / 100. SPLITS = ['development', 'evaluation', 'training'] def check_file_count(task, file_counts, data_split): messages = [] events = {} opinion = {} negation = {} if data_split == 'real': events['training'] = 800 events['development'] = 150 events['evaluation'] = 260 opinion['training'] = 449 opinion['development'] = 90 opinion['evaluation'] = 148 negation['training'] = 1 negation['development'] = 1 negation['evaluation'] = 1 elif data_split == 'trial': events['training'] = 410 events['development'] = 83 events['evaluation'] = 0 opinion['training'] = 204 opinion['development'] = 49 opinion['evaluation'] = 0 negation['training'] = 1 negation['development'] = 1 negation['evaluation'] = 1 if task == "events": for k in file_counts.keys(): if len(file_counts[k]) != events[k]: messages.append('''The number of system files (%d) in %s's %s split doesn't match the original number of files (%d).''' % (len(file_counts[k]), task, k, events[k])) elif task == "opinion": for k in file_counts.keys(): if len(file_counts[k]) != opinion[k]: messages.append('''The number of system files (%d) in %s's %s split doesn't match the original number of files (%d).''' % (len(file_counts[k]), task, k, opinion[k])) elif task == "negation": for k in file_counts.keys(): if len(file_counts[k]) != negation[k]: messages.append('''The number of system files (%d) in %s's %s split doesn't match the original number of files (%d).''' % (len(file_counts[k]), task, k, negation[k])) return messages def unpack(infile, task, data_split): file_counts = {} extension = "." + infile.split('.')[-1] for split in SPLITS: file_counts[split] = [] if data_split == 'trial': key_filename = TRIAL_KEYS[task] elif data_split == 'real': key_filename = REAL_KEYS[task] lines = "" ignore = 0 delimiter = False with open(infile, 'r') as file: for line in file: if len(line) < 2 or line[0] == '#': ignore = len(line) lines += line delimiter = False elif line.split('\t')[1] in key_filename: p_filename = key_filename[line.split('\t')[1]] split = p_filename.split('/')[0] file_counts[split].append('.'.join(p_filename.split('/')[-1].split('.')[:-1]) + ".txt") # set delimiter to True to ignore the rest of the separator # line, i.e.: "ends." delimiter = True lines = "" ignore = 0 else: if not delimiter: lines += line # keep track of the length of the previous line to ignore it # when writing out the file; this is important to ignore the # token "Document" in the separator. ignore = len(line) return check_file_count(task, file_counts, data_split) TRIAL_FILES_SIZE = {'events': 781250, 'opinion': 733106, 'negation': 455702} REAL_FILES_SIZE = {'events': 1881915, 'opinion': 2169699, 'negation': 455702} def validate_file_size(input_path): events_size = stat(path.join(input_path, 'events.txt')).st_size opinion_size = stat(path.join(input_path, 'opinion.txt')).st_size if events_size == TRIAL_FILES_SIZE['events'] and opinion_size == TRIAL_FILES_SIZE['opinion']: return 'trial' elif events_size == REAL_FILES_SIZE['events'] and opinion_size == REAL_FILES_SIZE['opinion']: return 'real' else: return 'Oops the original file seems to have been modified.' class TestRoundScore(unittest.TestCase): def test_round_score(self): self.assertEqual(round_score(0.874999), 0.85) self.assertEqual(round_score(0.875001), 0.9) self.assertEqual(round_score(0.924), 0.9) self.assertEqual(round_score(0.924999), 0.9) self.assertEqual(round_score(0.925001), 0.95) self.assertEqual(round_score(0.93), 0.95) self.assertEqual(round_score(0.974999), 0.95) self.assertEqual(round_score(0.975001), 1.0) def main(): # Parse arguments parser = argparse.ArgumentParser() parser.add_argument("truth", type=str, help="Directory name of the truth dataset.") parser.add_argument("system", type=str, help="Directory name of system output.") args = parser.parse_args() # Load input dataset metadata.json with open(args.truth + "/metadata.json","r") as metadata_file: metadata = json.load(metadata_file) # Evaluate and compute sum of all treebanks treebanks = 0 results = [] for entry in metadata: treebanks += 1 #ltcode, goldfile, outfile = "_".join((entry['lcode'], entry['tcode'])), entry['goldfile'], entry['outfile'] ltcode, outfile = "_".join((entry['lcode'], entry['tcode'])), entry['outfile'] # Load gold data # try: # gold = load_conllu_file(args.truth + "/" + goldfile) # except: # results.append((ltcode+"-Status", "Error: Cannot load gold file")) # continue # Load system data try: system = load_conllu_file(args.system + "/" + outfile) except UDError as e: if e.args[0].startswith("There is a cycle"): results.append((ltcode+"-Status", "Error: There is a cycle in generated CoNLL-U file")) continue if e.args[0].startswith("There are multiple roots"): results.append((ltcode+"-Status", "Error: There are multiple roots in a sentence in generated CoNLL-U file")) continue results.append((ltcode+"-Status", "Error: There is a format error (tabs, ID values, etc) in generated CoNLL-U file")) continue except: results.append((ltcode+"-Status", "Error: Cannot open generated CoNLL-U file")) continue # Check for correctness if not system.characters: results.append((ltcode+"-Status", "Error: The system file is empty")) continue # if system.characters != gold.characters: # results.append((ltcode+"-Status", "Error: The concatenation of tokens in gold file and in system file differ, system file has {} nonspace characters, which is approximately {}% of the gold file".format(len(system.characters), int(100 * len(system.characters) / len(gold.characters))))) # continue data_split = validate_file_size(args.truth) if data_split != 'real' and data_split != 'trial': results.append((ltcode+"-Status", data_split)) else: for entry in metadata: err_messages = unpack(path.join(args.system, entry['outfile']), entry['rawfile'].split('.')[0], data_split) for message in err_messages: results.append((ltcode+"-Status", message)) for key, value in results: if not key.endswith("-Status"): continue ltcode = key[:-len("-Status")] print("{:13} {}".format(ltcode, value), file=sys.stderr) if __name__ == "__main__": main() # Tests, which can be executed with `python -m unittest conll18_ud_eval`. class TestAlignment(unittest.TestCase): @staticmethod def _load_words(words): """Prepare fake CoNLL-U files with fake HEAD to prevent multiple roots errors.""" lines, num_words = [], 0 for w in words: parts = w.split(" ") if len(parts) == 1: num_words += 1 lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, parts[0], int(num_words>1))) else: lines.append("{}-{}\t{}\t_\t_\t_\t_\t_\t_\t_\t_".format(num_words + 1, num_words + len(parts) - 1, parts[0])) for part in parts[1:]: num_words += 1 lines.append("{}\t{}\t_\t_\t_\t_\t{}\t_\t_\t_".format(num_words, part, int(num_words>1))) return load_conllu((io.StringIO if sys.version_info >= (3, 0) else io.BytesIO)("\n".join(lines+["\n"]))) def _test_exception(self, gold, system): self.assertRaises(UDError, evaluate, self._load_words(gold), self._load_words(system)) def _test_ok(self, gold, system, correct): metrics = evaluate(self._load_words(gold), self._load_words(system)) gold_words = sum((max(1, len(word.split(" ")) - 1) for word in gold)) system_words = sum((max(1, len(word.split(" ")) - 1) for word in system)) self.assertEqual((metrics["Words"].precision, metrics["Words"].recall, metrics["Words"].f1), (correct / system_words, correct / gold_words, 2 * correct / (gold_words + system_words))) def test_exception(self): self._test_exception(["a"], ["b"]) def test_equal(self): self._test_ok(["a"], ["a"], 1) self._test_ok(["a", "b", "c"], ["a", "b", "c"], 3) def test_equal_with_multiword(self): self._test_ok(["abc a b c"], ["a", "b", "c"], 3) self._test_ok(["a", "bc b c", "d"], ["a", "b", "c", "d"], 4) self._test_ok(["abcd a b c d"], ["ab a b", "cd c d"], 4) self._test_ok(["abc a b c", "de d e"], ["a", "bcd b c d", "e"], 5) def test_alignment(self): self._test_ok(["abcd"], ["a", "b", "c", "d"], 0) self._test_ok(["abc", "d"], ["a", "b", "c", "d"], 1) self._test_ok(["a", "bc", "d"], ["a", "b", "c", "d"], 2) self._test_ok(["a", "bc b c", "d"], ["a", "b", "cd"], 2) self._test_ok(["abc a BX c", "def d EX f"], ["ab a b", "cd c d", "ef e f"], 4) self._test_ok(["ab a b", "cd bc d"], ["a", "bc", "d"], 2) self._test_ok(["a", "bc b c", "d"], ["ab AX BX", "cd CX a"], 1)