package mpqareader; import java.io.*; import java.util.*; import java.util.regex.*; import se.lth.cs.nlp.nlputils.annotations.*; import se.lth.cs.nlp.nlputils.core.Util; import se.lth.cs.nlp.nlputils.core.Strings; public class MPQAReader { private static final boolean FILTER_TAGS = true; static final Pattern PROP_PAT = Pattern.compile("([a-z\\-]+)=\"([^\"]*)\"\\s*"); static final Pattern TAG_PAT = Pattern.compile("<.*?>"); static final Pattern EXTENDED_TAG_PAT = Pattern.compile("(
(.|\\s)*?
)|(<.*?>)|\\.START|-----Original Message-----|((^|\\n)(From:|Sent:|To:|Date:|Subject:|DOCUMENT|TRANSLATION:|DATE:|Alias:|Marital status:|Message-ID:|Mime-Version:|Content-Type:|Content-Transfer-Encoding:|X-(.*?):).*)+"); static class MPQADirIterator implements Iterator { private String dirName; private File[] subdirs1; private File[] subdirs2; private int index1; private int index2; private boolean removeTags; private Iterator fileListIterator; MPQADirIterator(String dirName, boolean removeTags, Collection files) { this.dirName = dirName.substring(0, dirName.indexOf("database.mpqa.2.0")); if(files == null) { File dir = new File(dirName + File.separatorChar + "docs"); subdirs1 = dir.listFiles(); if(subdirs1 == null) throw new IllegalArgumentException("Could not read directory"); index1 = -1; index2 = -1; } else fileListIterator = files.iterator(); this.removeTags = removeTags; } MPQADirIterator(String dirName) { this(dirName, false, null); } /* TODO state checks */ //private AnnotatedText nextValue; public boolean hasNext() { if(fileListIterator != null) return fileListIterator.hasNext(); //if(nextValue != null) // return true; if(index2 == -1 || index2 == subdirs2.length - 1) { index1++; if(index1 >= subdirs1.length) return false; File subdir = subdirs1[index1]; subdirs2 = subdir.listFiles(); if(subdirs2 == null) throw new RuntimeException("could not list directory: " + subdir); if(subdirs2.length == 0) throw new RuntimeException("directory is empty"); index2 = 0; return true; } else { index2++; return true; } } private static char[] buf = new char[10000]; public AnnotatedText next() { try { File subfile2; if(fileListIterator == null) subfile2 = subdirs2[index2]; else subfile2 = new File(dirName + fileListIterator.next()); //System.out.println(dirName); //System.out.println(subfile2); //System.exit(0); if(subfile2.getAbsolutePath().contains("database.mpqa.2.0/docs/20020203/20.46.36-9539")) { warn("*** Warning: skipped " + subfile2); //System.exit(0); if(!hasNext()) throw new RuntimeException("!!!"); subfile2 = subdirs2[index2]; } //System.out.println(subfile2); BufferedReader br = new BufferedReader(new FileReader(subfile2)); /*StringBuilder sb = new StringBuilder(); String line = br.readLine(); while(line != null) { sb.append(line); sb.append("\n"); // osäker på windows här line = br.readLine(); } br.close();*/ StringBuilder sb = new StringBuilder(); while(true) { int n = br.read(buf); if(n == -1) break; sb.append(new String(buf, 0, n)); } String mpqaFileName = subfile2.getPath().replaceFirst(".*database.mpqa.2.0/docs", "database.mpqa.2.0/docs"); AnnotatedText text = new AnnotatedText(); text.setProperty("mpqa_file", mpqaFileName); text.text = sb.toString(); String annDir = subfile2.getPath().replaceFirst("docs", "man_anns"); String metaPath = subfile2.getPath().replaceFirst("docs", "meta_anns"); for(File annFile: new File(annDir).listFiles()) { AnnotationLayer l = new AnnotationLayer(); l.label = annFile.getName(); BufferedReader br2 = new BufferedReader(new FileReader(annFile)); String line2 = br2.readLine(); while(line2 != null) { line2 = line2.trim(); if(!line2.equals("") && !line2.startsWith("#")) { String[] ss = line2.split("\\t"); if(ss.length != 4 && ss.length != 5) { System.out.println("line2 = " + line2); System.out.println(Arrays.asList(ss)); throw new RuntimeException("ss.length != 5"); } if(!ss[2].equals("string")) throw new RuntimeException("ss[2] != string"); String[] ss2 = ss[1].split("\\,"); if(ss2.length != 2) throw new RuntimeException("ss2.length != 2"); int start = Integer.parseInt(ss2[0]); int end = Integer.parseInt(ss2[1]); Span span = new Span(start, end, ss[3]); span.id = ss[0]; if(ss.length == 5) { ss[4] = ss[4].trim(); if(ss[4].length() > 0) { Matcher m = PROP_PAT.matcher(ss[4]); int ix = 0; while(m.find()) { String k = m.group(1); String v = m.group(2); span.setProperty(k, v); if(m.start() != ix) throw new RuntimeException("!!!"); ix = m.end(); } if(ix != ss[4].length()) { throw new RuntimeException("!!!"); } /* String[] ss3 = ss[4].split("(?<=\"([^\"]*)\")\\s+"); for(String p: ss3) { String[] ss4 = p.split("="); if(ss4.length != 2) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("ss4.length != 2"); } if(!ss4[1].startsWith("\"") || !ss4[1].endsWith("\"")) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("unquoted"); } if(ss4[1].length() < 2) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("too short"); } span.setProperty(ss4[0], ss4[1].substring(1, ss4[1].length() - 1)); } */ } } l.add(span); } line2 = br2.readLine(); } text.layers.add(l); } File metaFile = new File(metaPath); if(metaFile.exists()) { BufferedReader br2 = new BufferedReader(new FileReader(metaFile)); String line2 = br2.readLine(); while(line2 != null) { if(!line2.startsWith("#")) { String[] ss = line2.split("\\t"); if(ss.length != 4 && ss.length != 5) { System.out.println("line2 = " + line2); System.out.println(Arrays.asList(ss)); throw new RuntimeException("ss.length != 5"); } String v = ss.length == 4? "": ss[4]; String k = ss[3].substring(5); if(k.matches("region|subregion|country|topic")) { ArrayList l = (ArrayList) text.getProperty(k); if(l == null) { l = new ArrayList(); text.setProperty(k, l); } l.add(v); } else { String v2 = (String) text.getProperty(k); if(v2 != null) throw new RuntimeException("property " + k + " already set"); text.setProperty(k, v); } } line2 = br2.readLine(); } } String oldText = text.text; if(FILTER_TAGS) { //text.replaceAll("<(.*?)>", ""); } Collections.sort(text.layers, new Comparator() { public int compare(AnnotationLayer o1, AnnotationLayer o2) { return -o1.label.compareTo(o2.label); } }); if(true) { // fix two offset bugs if(mpqaFileName.equals("database.mpqa.2.0/docs/xbank/wsj_0583")) { for(AnnotationLayer l: text.layers) for(Span s: l) if(s.start >= 2268) { s.start -= 3; s.end -= 3; warn("Fixing error in wsj_0583"); } /* try { text.splitLayers(); for(AnnotationLayer l: text.layers) { RawXMLAnnotation.instance().printLayers(System.out, text.text, Collections.singletonList(l)); } } catch(Exception e) { e.printStackTrace(); System.exit(1); } System.exit(0); */ } else if(mpqaFileName.equals("database.mpqa.2.0/docs/20020203/20.46.36-9539")) { throw new RuntimeException("This text is buggy and cannot be fixed"); } else if(mpqaFileName.equals("database.mpqa.2.0/docs/ula/Article247_66")) { for(Iterator it = text.layers.get(0).iterator(); it.hasNext(); ) { Span s = it.next(); if(s.start == 880 && s.end == 894) { it.remove(); break; } } } else if(mpqaFileName.equals("database.mpqa.2.0/docs/20020221/21.03.10-21966")) { for(AnnotationLayer l: text.layers) for(Span s: l) if(s.start == 641) s.start++; //System.out.println(text.layers); } else if(mpqaFileName.equals("database.mpqa.2.0/docs/ula/im_401b_e73i32c22_031705-2")) { /*for(AnnotationLayer l: text.layers) for(Span s: l) if(s.start == 1286 || s.start == 1709 || s.start == 1827) s.start++;*/ //System.out.println(text.layers); } } if(removeTags) removeTags(text); tokenize(text); Collections.sort(text.layers.get(0).spans, Span.ByLeftOrder.instance()); //for(Span sp: (AnnotationLayer) text.layers.get(0)) // System.out.println(sp); Span prevSen = null; for(Span sentence: (AnnotationLayer) text.layers.get(0)) { if(!sentence.label.equals("GATE_sentence")) throw new RuntimeException("not sentence"); if(sentence.tokenStart == -1 && sentence.start >= 0) sentence.tokenStart = 0; if(sentence.tokenEnd == -1 && sentence.end >= 0) //sentence.tokenEnd = 0; sentence.tokenEnd = sentence.tokenStart; if(false && (sentence.tokenStart < 0 || sentence.tokenStart >= text.tokens.length)) { System.out.println("---"); System.out.println("Old text:"); System.out.println(oldText); System.out.println("---"); System.out.println("New text:"); System.out.println(text.text); System.out.println("---"); System.out.println("Sentence snippet: |" + text.text.substring(sentence.start, sentence.end) + "|"); System.out.flush(); for(int i = 0; i < text.tokens.length; i++) { System.out.println(i + " |" + text.tokens[i] + "|"); } throw new RuntimeException("1. sentence = " + sentence + ", tokens.length = " + text.tokens.length); } if(sentence.tokenEnd < sentence.tokenStart) { //for(Span sp: (AnnotationLayer) text.layers.get(0)) // System.out.println(sp); throw new RuntimeException("negative"); } if(sentence.tokenEnd > sentence.tokenStart && sentence.tokenEnd > text.tokens.length) { for(int i = 0; i < text.tokens.length; i++) { System.out.println(i + "\t" + text.tokens[i]); } System.out.println("sen = |" + text.text.substring(sentence.start, sentence.end) + "|"); throw new RuntimeException("2. sentence = " + sentence); } if(prevSen != null && sentence.tokenEnd > sentence.tokenStart) { if(sentence.start < prevSen.end) throw new RuntimeException("sentences not ordered [char]"); if(sentence.tokenStart < prevSen.tokenEnd) { System.err.println(mpqaFileName); String s1 = text.text.substring(sentence.start, sentence.end); String s2 = text.text.substring(prevSen.start, prevSen.end); System.err.println("|" + s1 + "|"); System.err.println("|" + s2 + "|"); for(int i = prevSen.tokenStart; i < prevSen.tokenEnd; i++) { System.err.println(text.tokens[i]); } System.err.println(); for(int i = sentence.tokenStart; i < sentence.tokenEnd; i++) { System.err.println(text.tokens[i]); } try { RawXMLAnnotation.instance().printLayers(System.err, text.text, Collections.singletonList(text.layers.get(0))); } catch(Exception e) { e.printStackTrace(); System.exit(1); } throw new RuntimeException("sentences not ordered [token] " + prevSen + ", " + sentence); } } if(sentence.tokenEnd > sentence.tokenStart) prevSen = sentence; } return text; } catch(IOException e) { throw new RuntimeException(e); } } private void removeTags(AnnotatedText text) { String oldText = text.text; StringBuffer sb = new StringBuffer(); Matcher m = EXTENDED_TAG_PAT.matcher(text.text); while(m.find()) { String repl = m.group(); repl = repl.replaceAll(".", " "); m.appendReplacement(sb, repl); } m.appendTail(sb); text.text = sb.toString(); /*if(!text.text.equals(oldText)) { System.out.println("***** Including tags *****"); System.out.println(oldText); System.out.println("***** Removed tags *****"); System.out.println(text.text); }*/ } public void remove() { throw new UnsupportedOperationException("unsupported"); } } private static void warn(Object msg) { if(false) System.err.println(msg); } public static Iterator processDirectory(String dirName) { return new MPQADirIterator(dirName); } public static Iterator processDirectory(String dirName, Collection selectedFiles) { return new MPQADirIterator(dirName, true, selectedFiles); } public static void main0(String[] argv) { try { String dirName = argv[0]; File dir = new File(dirName + File.separatorChar + "docs"); PrintWriter pw = new PrintWriter(System.out); for(File subdir: dir.listFiles()) { for(File subfile2: subdir.listFiles()) { System.out.println(subfile2); //BufferedReader br = new BufferedReader(new FileReader(subfile2)); /*StringBuilder sb = new StringBuilder(); String line = br.readLine(); while(line != null) { sb.append(line); sb.append("\n"); // osäker på windows här line = br.readLine(); } br.close();*/ final int BUF_SIZE = 10000; byte[] buf = new byte[BUF_SIZE]; ByteArrayOutputStream bos = new ByteArrayOutputStream(); InputStream bis = new BufferedInputStream(new FileInputStream(subfile2)); int n = bis.read(buf); while(n != -1) { bos.write(buf); n = bis.read(buf); } AnnotatedText text = new AnnotatedText(); text.text = new String(bos.toByteArray()); // sb.toString(); String annDir = subfile2.getPath().replaceFirst("docs", "man_anns"); String metaPath = subfile2.getPath().replaceFirst("docs", "meta_anns"); for(File annFile: new File(annDir).listFiles()) { System.out.println(annFile); AnnotationLayer l = new AnnotationLayer(); l.label = annFile.getName(); BufferedReader br2 = new BufferedReader(new FileReader(annFile)); String line2 = br2.readLine(); while(line2 != null) { line2 = line2.trim(); if(!line2.equals("") && !line2.startsWith("#")) { String[] ss = line2.split("\\t"); if(ss.length != 4 && ss.length != 5) { System.out.println("line2 = " + line2); System.out.println(Arrays.asList(ss)); throw new RuntimeException("ss.length != 5"); } if(!ss[2].equals("string")) throw new RuntimeException("ss[2] != string"); String[] ss2 = ss[1].split("\\,"); if(ss2.length != 2) throw new RuntimeException("ss2.length != 2"); int start = Integer.parseInt(ss2[0]); int end = Integer.parseInt(ss2[1]); Span span = new Span(start, end, ss[3]); span.id = ss[0]; if(ss.length == 5) { ss[4] = ss[4].trim(); if(ss[4].length() > 0) { Matcher m = PROP_PAT.matcher(ss[4]); int ix = 0; while(m.find()) { String k = m.group(1); String v = m.group(2); span.setProperty(k, v); if(m.start() != ix) throw new RuntimeException("!!!"); ix = m.end(); } if(ix != ss[4].length()) { throw new RuntimeException("!!!"); } /* String[] ss3 = ss[4].split("(?<=\"([^\"]*)\")\\s+"); for(String p: ss3) { String[] ss4 = p.split("="); if(ss4.length != 2) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("ss4.length != 2"); } if(!ss4[1].startsWith("\"") || !ss4[1].endsWith("\"")) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("unquoted"); } if(ss4[1].length() < 2) { System.out.println(Arrays.toString(ss)); System.out.println(ss[4]); System.out.println(Arrays.asList(ss4)); throw new RuntimeException("too short"); } span.setProperty(ss4[0], ss4[1].substring(1, ss4[1].length() - 1)); } */ } } l.add(span); } line2 = br2.readLine(); } text.layers.add(l); } File metaFile = new File(metaPath); if(metaFile.exists()) { BufferedReader br2 = new BufferedReader(new FileReader(metaFile)); String line2 = br2.readLine(); while(line2 != null) { if(!line2.startsWith("#")) { String[] ss = line2.split("\\t"); if(ss.length != 4 && ss.length != 5) { System.out.println("line2 = " + line2); System.out.println(Arrays.asList(ss)); throw new RuntimeException("ss.length != 5"); } String v = ss.length == 4? "": ss[4]; String k = ss[3].substring(5); if(k.matches("region|subregion|country|topic")) { ArrayList l = (ArrayList) text.getProperty(k); if(l == null) { l = new ArrayList(); text.setProperty(k, l); } l.add(v); } else { String v2 = (String) text.getProperty(k); if(v2 != null) throw new RuntimeException("property " + k + " already set"); text.setProperty(k, v); } } line2 = br2.readLine(); } } //System.out.println("Text done:"); //System.out.println(text); //RawXMLAnnotation.instance().printLayers(System.out, text); tokenize(text); //text.findTokenIndices(); text.splitLayers(); //ColumnSpanAnnotation.instance().printLayers(pw, text); //System.exit(0); //System.out.println("Layers:"); for(AnnotationLayer l: text.layers) { System.out.println(l.label); List ls = Collections.singletonList(l); try { RawXMLAnnotation.instance().printLayers(text.text, ls); } catch(Exception e) { System.out.println("*** Exception: " + e); System.out.println(l); } } //System.exit(0); //for(String t: text.tokens) { // System.out.println(t); //} } } } catch(Exception e) { e.printStackTrace(); } } private static String escapeTags(String text) { StringBuffer sb = new StringBuffer(); Matcher m = TAG_PAT.matcher(text); while(m.find()) { String repl = m.group(); repl = repl.replaceAll(".", "~"); m.appendReplacement(sb, repl); } m.appendTail(sb); return sb.toString(); } private static void tokenize(AnnotatedText text) { //System.out.println(text.properties); //System.out.println(text.text); String oldText = text.text; text.text = escapeTags(text.text); if(false && text.text.contains("~")) { System.out.println(oldText); System.out.println("***"); System.out.println("raw text = " + text.text); System.out.println("---"); System.exit(0); } for(Span sen: (AnnotationLayer) text.layers.get(0)) { if(!sen.label.equals("GATE_sentence")) throw new RuntimeException("not sentence: " + sen); int start = sen.start; int end = sen.end; if(start > 0 && Character.isLetter(text.text.charAt(start-1)) && Character.isLetter(text.text.charAt(start))) { //if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/xbank/wsj_0583")) // break; //if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/20020203/20.46.36-9539")) // break; System.err.println("*** WARNING: " + text.getProperty("mpqa_file") + " problematic"); String ss = text.text.substring(start, end); System.out.println("This sentence: |" + ss + "|"); try { RawXMLAnnotation.instance().printLayers(System.out, text.text, Collections.singletonList(text.layers.get(0))); } catch(Exception e) { e.printStackTrace(); System.exit(1); } System.exit(0); break; } } /*if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/xbank/wsj_0583")) { text.splitLayers(); //RawXMLAnnotation.instance().printLayers(text); try { RawXMLAnnotation.instance().printLayers(System.out, text.text, Collections.singletonList(text.layers.get(0))); } catch(Exception e) { e.printStackTrace(); System.exit(1); } try { RawXMLAnnotation.instance().printLayers(System.out, text.text, Collections.singletonList(text.layers.get(1))); } catch(Exception e) { e.printStackTrace(); System.exit(1); } System.exit(0); }*/ AnnotationLayer senLayer = text.layers.get(0); Collections.sort(senLayer.spans, Span.NestingOrder.instance()); ArrayList out = new ArrayList(); ArrayList ss = new ArrayList(); Span prevSen = null; for(Span sen: (AnnotationLayer) senLayer) { if(prevSen != null) { if(sen.start < prevSen.end) { System.out.println(senLayer); System.out.println(sen.start + ", " + prevSen.end); for(Span sen2: (AnnotationLayer) senLayer) { System.out.println(sen2.start + ", " + sen2.end + " |" + text.text.substring(sen2.start, sen2.end) + "|"); } throw new RuntimeException("Sentences not ordered..."); } } String senStr = text.text.substring(sen.start, sen.end); //System.out.println("senStr = |" + senStr + "|"); String[] ts = tokenizeSentence(senStr); for(String t: ts) out.add(t); int position = getNext(text.text, sen.start); //sen.start; for(int i = 0; i < ts.length; i++) { int[] p = new int[2]; p[0] = position; p[1] = getEnd(text.text, ts[i], position); position = getNext(text.text, p[1]); ss.add(p); } prevSen = sen; } text.tokens = out.toArray(new String[0]); int[] starts = new int[text.tokens.length]; int[] ends = new int[text.tokens.length]; for(int i = 0; i < text.tokens.length; i++) { starts[i] = ss.get(i)[0]; ends[i] = ss.get(i)[1]; } text.setProperty("token-starts", starts); text.setProperty("token-ends", ends); /* System.out.println("starts = " + Arrays.toString(starts)); System.out.println("ends = " + Arrays.toString(ends)); System.out.println("starts.length = " + starts.length); */ /*int position = senLayer.spans.get(0).start; for(int i = 0; i < text.tokens.length; i++) { starts[i] = position; ends[i] = getEnd(text.text, text.tokens[i], position); position = getNext(text.text, ends[i]); }*/ if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/AFGP-2002-600045-Trans")) { for(int i = 0; i < text.tokens.length; i++) { String t = text.text.substring(starts[i], ends[i]); warn(i + "\t|" + text.tokens[i] + "|\t|" + t + "|" + "\t" + starts[i] + "\t" + ends[i]); } } for(AnnotationLayer l: text.layers) for(Span s: l) { int st = s.start; while(st < s.end && st < text.text.length() && Character.isWhitespace(text.text.charAt(st))) st++; int en = s.end; while(en > st && en < text.text.length() && Character.isWhitespace(text.text.charAt(en - 1))) en--; if(false && text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/AFGP-2002-600045-Trans")) { if(st != s.start || en != s.end) { System.err.println("Modified span: (" + s.label + ")"); String t0 = text.text.substring(s.start, s.end); String t1 = text.text.substring(st, en); System.err.println("|" + t0 + "|"); System.err.println("|" + t1 + "|"); System.err.println("st = " + st + ", en = " + en); } } boolean debug = false; /*if(text.getProperty("mpqa_file").equals("database.mpqa.2.0/docs/ula/AFGP-2002-600045-Trans")) if(s.label.equals("GATE_sentence") && s.start == 3805) debug = true;*/ s.tokenStart = findIndexBSearch(starts, st, debug); // TODO probably a bug using "debug" here! //s.tokenEnd = findIndexBSearch(ends, s.end) + 1; if(st < en && s.tokenStart < starts.length) { s.tokenEnd = findIndexBSearch(ends, en, debug) + 1; if(s.tokenEnd > text.tokens.length) s.tokenEnd = text.tokens.length; if(s.tokenEnd < s.tokenStart) { //System.out.println("*** HERE ***: " + s); s.tokenEnd = text.tokens.length; } } else { s.tokenEnd = s.tokenStart; } } String tmp = text.text; text.text = oldText; boolean saw = false; for(int i = 0; i < text.tokens.length; i++) { if(text.tokens[i].startsWith("~~")) { text.tokens[i] = text.text.substring(starts[i], ends[i]); warn("Replaced: |" + text.tokens[i] + "|"); saw = true; } } if(false && saw) { System.out.println("*** CHECK ***"); System.out.println("|" + tmp + "|"); System.out.println("|" + text.text + "|"); for(int i = 0; i < text.tokens.length; i++) { String t = text.text.substring(starts[i], ends[i]); System.out.printf("%d\t%s\t%s\n", i, text.tokens[i], t); } System.exit(0); } /*for(int i = 0; i < text.tokens.length; i++) { System.out.printf("%d\t%s\n", i, text.tokens[i]); }*/ /* for(AnnotationLayer l: text.layers) { System.out.println("Layer: " + l.label); for(Span s: l) { System.out.print(s.label + ": (" + s.tokenStart + ", " + s.tokenEnd + ")"); System.out.print(" (" + s.start + ", " + s.end + ")"); String subs; try { subs = text.text.substring(s.start, s.end); } catch(Exception e) { subs = ""; } System.out.println(" |" + subs + "|"); } }*/ /*for(int i = 0; i < text.tokens.length; i++) { System.out.printf("%d\t%s %d %d\n", i, text.tokens[i], starts[i], ends[i]); }*/ } private static void checkSpan(Span s, AnnotatedText text) { if(s.label.equals("GATE_split")) return; /*if(s.tokenStart == -1) { System.err.println(s); System.err.println("|" + text.text.substring(s.start, s.end) + "|"); System.err.println(Arrays.toString((int[]) text.getProperty("token-starts"))); System.err.println(Arrays.toString((int[]) text.getProperty("token-ends"))); } if(s.tokenEnd == -1) { System.err.println(s); }*/ String s1 = text.text.substring(s.start, s.end); s1 = s1.replaceAll("\\s+", ""); StringBuilder sb2 = new StringBuilder(); for(int i = s.tokenStart; i < s.tokenEnd && i < text.tokens.length; i++) sb2.append(text.tokens[i]); String s2 = sb2.toString(); s2 = s2.replaceAll("``", "\""); s2 = s2.replaceAll("''", "\""); if(!s1.equals(s2)) { System.err.println("|" + s1 + "|"); System.err.println("|" + s2 + "|"); throw new RuntimeException("!!"); } } private static int findIndexBSearch(int[] arr, int c, boolean isStart) { if(arr.length == 0) return -1; if(c < arr[0]) { if(isStart) return -1; else return 0; } if(c > arr[arr.length - 1]) { if(isStart) return arr.length - 1; else return -1; } //return arr.length - 1; // ??? int low = 0, high = arr.length - 1, mid = (low + high) / 2; while(low < mid && mid < high) { if(arr[mid] == c) return mid; if(arr[low] == c) return low; if(arr[high] == c) return high; if(c < arr[mid]) high = mid; else low = mid; mid = (low + high) / 2; } //System.out.println("2. low = " + low + ", mid = " + mid + ", high = " + high); if(low < high) { if(arr[low] == c) return low; if(arr[high] == c) return high; // TODO testa if(isStart) return low; else return high; } return mid; } private static void testBSearch() { int[] starts = {0, 4, 8}; int[] ends = {3, 7, 11}; Object[][] tests = { { -1, -1, true }, { 0, 0, true }, { 4, 1, true }, { 8, 2, true }, { 1, 0, true }, { 2, 0, true }, { 3, 0, true }, // !! gränsfall! vad vill jag ha? { 5, 1, true }, { 6, 1, true }, { 7, 1, true }, // !! { 9, 2, true }, { 10, 2, true }, { 11, 2, true }, // !! { 12, 2, true }, // !?? { 3, 0, false }, { 7, 1, false }, { 11, 2, false }, { 0, 0, false }, // !?? { 1, 0, false }, { 2, 0, false }, { 4, 1, false }, // !! { 5, 1, false }, { 6, 1, false }, { 8, 2, false }, // !! { 9, 2, false }, { 10, 2, false }, { 12, -1, false }, // !?? }; for(int i = 0; i < tests.length; i++) { int charIndex = (Integer) tests[i][0]; int tokenIndex = (Integer) tests[i][1]; boolean isStart = (Boolean) tests[i][2]; int[] arr = isStart? starts: ends; int ix = findIndexBSearch(arr, charIndex, isStart); if(ix != tokenIndex) throw new RuntimeException("error " + i + ": ix = " + ix); } starts = new int[] { 0, 4 }; ends = new int[] { 3, 7 }; tests = new Object[][] { { -1, -1, true }, { 0, 0, true }, { 4, 1, true }, { 1, 0, true }, { 2, 0, true }, { 3, 0, true }, // !! gränsfall! vad vill jag ha? { 5, 1, true }, { 6, 1, true }, { 7, 1, true }, // !! { 3, 0, false }, { 7, 1, false }, { 0, 0, false }, // !?? { 1, 0, false }, { 2, 0, false }, { 4, 1, false }, // !! { 5, 1, false }, { 6, 1, false }, { 8, -1, false }, // !?? }; for(int i = 0; i < tests.length; i++) { int charIndex = (Integer) tests[i][0]; int tokenIndex = (Integer) tests[i][1]; boolean isStart = (Boolean) tests[i][2]; int[] arr = isStart? starts: ends; int ix = findIndexBSearch(arr, charIndex, isStart); if(ix != tokenIndex) throw new RuntimeException("error " + i + ": ix = " + ix); } } private static String[] tokenizeSentence(String s) { s = s.replaceAll("[\n\t\r]", " "); //System.out.println("s = |" + s + "|"); if(s.trim().equals("")) return new String[0]; // RJ s = s.replaceAll("(~(~+))", " $1 "); //# attempt to get correct directional quotes // s/^"/`` /g; s = s.replaceAll("^\"", "`` "); //s/([ \([{<])"/$1 `` /g; // # close quotes handled at end s = s.replaceAll("([ \\(\\[{<])\"", "$1 `` "); //s/\.\.\./ ... /g; s = s.replaceAll("\\.\\.\\.", " ... "); //s/[,;:@#$%&]/ $& /g; // RJ changed, removed comma s = s.replaceAll("[;:@#$%&]", " $0 "); // RJ s = s.replaceAll("([^ ]), ", "$1 , "); s = s.replaceAll("([^0-9 ]),([^0-9 ])", "$1 , $2"); //# Assume sentence tokenization has been done first, so split FINAL periods //# only. //s/([^.])([.])([\])}>"']*)[ \t]*$/$1 $2$3 /g; s = s.replaceAll("([^.])([.])([\\])}>\"']*)[ \\t]*$", "$1 $2$3"); //# however, we may as well split ALL question marks and exclamation points, //# since they shouldn't have the abbrev.-marker ambiguity problem //s/[?!]/ $& /g; s = s.replaceAll("[?!]", " $0 "); //# parentheses, brackets, etc. //s/[\]\[\(\){}\<\>]/ $& /g; s = s.replaceAll("[\\]\\[\\(\\){}\\<\\>]", " $0 "); //# Some taggers, such as Adwait Ratnaparkhi's MXPOST, use the parsed-file //# version of these symbols. //# UNCOMMENT THE FOLLOWING 6 LINES if you're using MXPOST. //s/\(/-LRB-/g; //s/\)/-RRB-/g; //s/\[/-LSB-/g; //s/\]/-RSB-/g; //s/{/-LCB-/g; //s/}/-RCB-/g; /* s = s.replaceAll("\\(", "-LRB-"); s = s.replaceAll("\\(", "-RRB-"); s = s.replaceAll("\\[", "-LSB-"); s = s.replaceAll("\\]", "-RSB-"); s = s.replaceAll("\\{", "-LCB-"); s = s.replaceAll("\\}", "-RCB-"); */ //s/--/ -- /g; s = s.replaceAll("--", " -- "); //# NOTE THAT SPLIT WORDS ARE NOT MARKED. Obviously this isn't great, since //# you might someday want to know how the words originally fit together -- //# but it's too late to make a better system now, given the millions of //# words we've already done "wrong". //# First off, add a space to the beginning and end of each line, to reduce //# necessary number of regexps. s = " " + s + " "; //s/"/ '' /g; s = s.replaceAll("\"", " '' "); // RJ från MPQA s = s.replaceAll("([^ `])`([^ `])", "$1'$2"); s = s.replaceAll("`([^ `])", "` $1"); s = s.replaceAll("([^ ])''", "$1 ''"); //RJ s = s.replaceAll(" '([^ '])", " ` $1"); s = s.replaceAll(" ` s ", " 's "); //# possessive or close-single-quote //s/([^'])' /$1 ' /g; s = s.replaceAll("([^'])' ", "$1 ' "); //# as in it's, I'm, we'd //s/'([sSmMdD]) / '$1 /g; //s/'ll / 'll /g; //s/'re / 're /g; //s/'ve / 've /g; //s/n't / n't /g; //s/'LL / 'LL /g; //s/'RE / 'RE /g; //s/'VE / 'VE /g; //s/N'T / N'T /g; s = s.replaceAll("'([sSmMdD]) ", " '$1 "); s = s.replaceAll("'ll ", " 'll "); s = s.replaceAll("'re ", " 're "); s = s.replaceAll("'ve ", " 've "); s = s.replaceAll("n't ", " n't "); s = s.replaceAll("'LL ", " 'LL "); s = s.replaceAll("'RE ", " 'RE "); s = s.replaceAll("'VE ", " 'VE "); s = s.replaceAll("N'T ", " N'T "); //s/ ([Cc])annot / $1an not /g; //s/ ([Dd])'ye / $1' ye /g; //s/ ([Gg])imme / $1im me /g; //s/ ([Gg])onna / $1on na /g; //s/ ([Gg])otta / $1ot ta /g; //s/ ([Ll])emme / $1em me /g; //s/ ([Mm])ore'n / $1ore 'n /g; //s/ '([Tt])is / '$1 is /g; //s/ '([Tt])was / '$1 was /g; //s/ ([Ww])anna / $1an na /g; //# s/ ([Ww])haddya / $1ha dd ya /g; //# s/ ([Ww])hatcha / $1ha t cha /g; s = s.replaceAll(" ([Cc])annot ", " $1an not "); s = s.replaceAll(" ([Dd])'ye ", " $1' ye "); s = s.replaceAll(" ([Gg])imme ", " $1im me "); s = s.replaceAll(" ([Gg])onna ", " $1on na "); s = s.replaceAll(" ([Gg])otta ", " $1ot ta "); s = s.replaceAll(" ([Ll])emme ", " $1em me "); s = s.replaceAll(" ([Mm])ore'n ", " $1ore 'n "); s = s.replaceAll(" '([Tt])is ", " '$1 is "); s = s.replaceAll(" '([Tt])was ", " '$1 was "); s = s.replaceAll(" ([Ww])anna ", " $1an na "); s = s.replaceAll("", ""); // # clean out extra spaces //s/ */ /g; //s/^ *//g; s = s.replaceAll(" *", " "); s = s.trim(); //System.out.println("|" + s + "|"); //System.out.flush(); return s.split(" "); } private static int getNext(String lkText, int position) { while(position < lkText.length() && Character.isWhitespace(lkText.charAt(position))) position++; return position; } private static int getEnd(String lkText, String token, int position) { //String line) { int len = Math.min(lkText.length() - position, token.length()); //System.out.println("position = " + position); //System.out.println("len = " + len); String t2 = lkText.substring(position, position + len); if(!token.equals(t2)) { if(token.matches("``|''") && t2.startsWith("\"")) { len = 1; t2 = "\""; } else if(token.equals("`") && t2.startsWith("'")) { len = 1; t2 = "'"; } else if(token.replaceAll("`", "'").equals(t2.replaceAll("`", "'"))) { len = token.length(); } else { //System.out.println(line); System.out.println(getContext(lkText, position)); System.out.println("position = " + position); System.out.println("ss = " + lkText.substring(position)); throw new RuntimeException(token + " != " + t2); } } //System.out.println(token); position += len; //while(position < lkText.length() && Character.isWhitespace(lkText.charAt(position))) // position++; //System.out.println("returning " + position); return position; } private static String getContext(String text, int pos) { int start = Math.max(0, pos - 100); int end = Math.min(text.length(), pos + 100); return text.substring(start, end); } public static void main1(String[] argv) { Iterator i = processDirectory(argv[0]); while(i.hasNext()) { AnnotatedText t = i.next(); System.out.println(t.getProperty("mpqa_file")); for(int ii = 0; ii < t.tokens.length; ii++) { System.out.printf("%d\t%s\n", ii, t.tokens[ii]); } } } public static void main3(String[] argv) { String s = Strings.join(argv, " "); System.out.println("|" + s + "|"); String[] ts = tokenizeSentence(s); for(String t: ts) System.out.println(t); } public static void main4(String[] argv) { Iterator i = processDirectory(argv[0]); int count = 0; int countSentences = 0; while(i.hasNext()) { count++; AnnotatedText text = i.next(); for(AnnotationLayer l: text.layers) for(Span s: l) if(s.label.equals("GATE_sentence")) countSentences++; } System.out.println(count); System.out.println(countSentences); } public static void main5(String[] argv) { Iterator i = processDirectory(argv[0]); ArrayList l = new ArrayList(); int count = 0; while(i.hasNext()) { count++; AnnotatedText text = i.next(); String f = (String) text.getProperty("mpqa_file"); f = f.replaceAll("database.mpqa.2.0/docs/", ""); l.add(f); } Random rand = new Random(0); Collections.shuffle(l, rand); final int N = 5; ArrayList[] ls = new ArrayList[N]; for(int ii = 0; ii < N; ii++) ls[ii] = new ArrayList(); int ix = 0; while(!l.isEmpty()) { String f = l.remove(l.size() - 1); ls[ix].add(f); ix = (ix + 1) % N; } for(int ii = 0; ii < N; ii++) { String outfile = "folds/fold" + ii; try { PrintWriter pw = new PrintWriter(new FileWriter(outfile)); Collections.sort(ls[ii]); for(String f: ls[ii]) pw.println(f); pw.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } } public static void main6(String[] argv) { Iterator i = processDirectory(argv[0]); while(i.hasNext()) { AnnotatedText t = i.next(); if(t.getProperty("mpqa_file").toString().contains("12.55.04-23296")) { Util.printFileBytes((String) t.getProperty("mpqa_file")); System.out.println("---"); for(int ii = 0; ii < t.text.length(); ii++) { char c = t.text.charAt(ii); String cs = c < ' '? "": ("" + c); System.out.format("%d\t%s\n", ii, cs); } System.out.println("---"); for(AnnotationLayer l: t.layers) { System.out.println("Layer: " + l.label); for(Span s: l) { String ps = s.properties == null? "(null)": s.properties.toString(); String ss = s.start <= s.end? t.text.substring(s.start, s.end): ""; System.out.format("%d %d %d %d %s %s %s\n", s.start, s.end, s.tokenStart, s.tokenEnd, "|" + ss + "|", s.label, ps); } } break; } } } public static void main7(String[] argv) { try { Iterator i = processDirectory(argv[0]); //PrintWriter pw = new PrintWriter(new FileWriter("mpqa_wsj.txt")); //System.out.println("Processing files."); while(i.hasNext()) { AnnotatedText t = i.next(); //System.out.println(t.getProperty("mpqa_file")); String filename = (String)t.getProperty("mpqa_file"); filename = filename.replaceAll(".*/", ""); if(!filename.contains("wsj")) continue; AnnotationLayer l = new AnnotationLayer(); AnnotationLayer l0 = t.layers.get(1); for(Span s: l0.spans) { if(s.label.matches("GATE_(objective-speech-event|direct-subjective|expressive-subjectivity)") && s.start < s.end) l.spans.add(s); } ArrayList ls = new ArrayList(); ls.add(l); System.out.println(""); RawXMLAnnotation.instance().printLayers(System.out, t.text, ls); } } catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void main(String[] argv) { //main6(argv); main7(argv); //testBSearch(); } }