// -*- Mode: JavaScript; tab-width: 4; indent-tabs-mode: nil; -*- // vim:set ft=javascript ts=4 sw=4 sts=4 cindent: /* CoNLL-U format library for JavaScript. Home: http://github.com/spyysalo/conllu.js Format: http://universaldependencies.github.io/docs/format.html Author: Sampo Pyysalo License: MIT (http://opensource.org/licenses/MIT) */ var ConllU = (function(window, undefined) { /* * ConllU.Document: represents CoNLL-U document */ var Document = function() { this.reset(); }; Document.prototype.reset = function() { this.sentences = []; this.error = false; this.logger = function(s) { /* no-op */ }; this.strict = null; // pick heuristically }; Document.prototype.log = function(message) { this.logger(message); }; Document.prototype.logError = function(message) { this.log('error: ' + message); this.error = true; }; /* Parse CoNLL-U format, return Document. * (see http://universaldependencies.github.io/docs/format.html) * * CoNLL-U files contain three types of lines: * 1. Word lines * 2. Blank lines marking sentence boundaries * 3. Comment lines starting with a hash ("#") * * Each word line has the following format * 1. ID: Word index, integer starting at 1 for each new sentence; * may be a range for tokens with multiple words. * 2. FORM: Word form or punctuation symbol. * 3. LEMMA: Lemma or stem of word form. * 4. CPOSTAG: Google universal part-of-speech tag from the * Universal POS tag set. * 5. POSTAG: Language-specific part-of-speech tag; underscore * if not available. * 6. FEATS: List of morphological features from the Universal * feature inventory or from a defined language-specific extension; * underscore if not available. * 7. HEAD: Head of the current token, which is either a value of ID * or zero (0). * 8. DEPREL: Universal Stanford dependency relation to the HEAD * (root iff HEAD = 0) or a defined language-specific subtype * of one. * 9. DEPS: List of secondary dependencies (head-deprel pairs). * 10. MISC: Any other annotation. */ Document.prototype.parse = function(input, logger, strict) { // discard previous state, if any this.reset(); if (logger !== undefined) { this.logger = logger; } if (strict !== undefined) { this.strict = strict; } // TODO: handle other newline formats var lines = input.split('\n'); if (this.strict === null) { this.strict = selectParsingMode(input, this.logger); } // select splitter to use for dividing the lines into fields. var splitter = selectFieldSplitter(input, this.logger, this.strict); var elements = [], comments = [], beforeSentence = true; for (var idx=0; idxSTYLE", where REF // is either a single ID (for a span), a space-separated // ID1 ID2 TYPE triple (for a relation), or a special // wildcard value like "arcs", and STYLE is either // a colon-separated key-value pair or a color. m = styleSpec.match(/^([^\t]+)\s+(\S+)\s*$/); if (!m) { // TODO: avoid console.log console.log('warning: failed to parse: "'+comment+'"'); continue; } var reference = m[1], style = m[2]; // split style into key and value, adding a key to // color-only styles as needed for the reference type. var key, value; m = style.match(/^(\S+):(\S+)$/); if (m) { key = m[1]; value = m[2]; } else { value = style; if (reference === 'arcs' || reference.indexOf(' ') !== -1) { key = 'color'; } else { key = 'bgColor'; } } // store wildcards for separate later processing if (reference.match(/^(nodes|arcs)$/)) { wildcards.push([reference, key, value]); continue; } // adjust every ID in reference for brat if (reference.indexOf(' ') === -1) { reference = this.id + '-T' + reference; } else { reference = reference.split(' '); reference[0] = this.id + '-T' + reference[0]; reference[1] = this.id + '-T' + reference[1]; } styles.push([reference, key, value]); } // for expanding wildcards, first determine which words / arcs // styles have already been set, and then add the style to // everything that hasn't. var setStyle = {}; for (var i=0; i 10) { logger('repair: discarding fields > 10'); fields = fields.slice(0, 10); } else { logger('repair: filling in empty ("_") for missing fields'); for (var m=0; m<10-fields.length; m++) { fields.push('_'); } } }; var strictFieldSplitter = function(line) { // strict CoNLL format parsing: only split on TAB, no extra space. if (line.length === 0) { return []; } else { return line.split('\t'); } } var looseFieldSplitter = function(line) { // loose CoNLL format parsing: split on any space sequence, trim // surrounding space. line = line.trim(); if (line.length === 0) { return []; } else { return line.split(/\s+/); } } var selectParsingMode = function(conll, log) { // return whether to use strict mode parsing // very simple heuristic: any TABs in the input trigger // strict parsing, loose only if none present. if (conll.indexOf('\t') !== -1) { log('note: TAB found, parsing CoNLL-U in strict mode.') return true; } else { log('note: no TAB found, parsing CoNLL-U in loose mode.') return false; } }; var selectFieldSplitter = function(conll, log, strict) { // return function to use for dividing lines into fields. if (strict) { return strictFieldSplitter; } else { return looseFieldSplitter; } }; var isComment = function(line) { return line.length !== 0 && line[0] === '#'; }; var hasSpace = function(s) { return !!s.match(/\s/); }; var nullLogger = function(message) { return null; } /* * Return true iff given string only contains characters from a * right-to-left Unicode block and is not empty. */ var isRtl = function(s) { // range from http://stackoverflow.com/a/14824756 return !!s.match(/^[\u0591-\u07FF\uFB1D-\uFDFD\uFE70-\uFEFC]+$/); }; /* * Return given token with possible modifications to accommodate * issues in brat rendering of right-to-left text * (https://github.com/UniversalDependencies/docs/issues/52) */ var rtlFix = function(s) { var prefix = '\u02D1', suffix = '\u02D1'; if (isRtl(s)) { s = prefix + s + suffix; } return s; }; /* * Return a deep copy of the given object. Note: not particularly * efficient, and all fields must be serializable for this to work * correctly. */ var deepCopy = function(o) { return JSON.parse(JSON.stringify(o)); }; /* * Regular expressions for various parts of the format. * See https://github.com/UniversalDependencies/docs/issues/33 */ // match single (feature, value[s]) pair in FEATS var featureRegex = /^([A-Z0-9][a-zA-Z0-9]*(?:\[[a-z0-9]+\])?)=([A-Z0-9][a-zA-Z0-9]*(?:,[A-Z0-9][a-zA-Z0-9]*)*)$/; // match single feature value in FEATS var featureValueRegex = /^[A-Z0-9][a-zA-Z0-9]*$/; // match single (head, deprel) pair in DEPS var dependencyRegex = /^(\d+):(.*)$/; return { Document: Document, Sentence: Sentence, Element: Element, }; })(window);