/*
 * CoNLL2008ToLK3 -- command-line converter. It reads dependency graphs and
 * predicate-argument structures from one CoNLL-2008 file (through
 * CoNLL2008Format.readNextGraph) and writes, per document, two UTF-8 files:
 * BASENAME.depsyntax.xml and BASENAME.predargs.xml.
 *
 * NOTE(review): this copy of the file looks damaged and should be restored
 * from a clean original before any logic is changed:
 *   - the whole class sits on four physical lines, so every "//" comment
 *     lexically swallows the remainder of its line;
 *   - generic type arguments are missing ("ArrayList>> triples",
 *     "Triple> t", raw "List pas" iterated as PAStructure);
 *   - the string literals that presumably carried XML markup are empty or cut
 *     short, and the token-file loop that is expected to fill "ids" using
 *     "pattern" breaks off right after line.startsWith("").
 * The code below is intentionally left byte-for-byte untouched.
 *
 * main(argv)
 *   argv[0] = directory to scan, argv[1] = CoNLL-2008 input file,
 *   argv[2] = output directory. Echoes argv, opens argv[1] once as a UTF-8
 *   BufferedReader, sorts the names returned by File.list() and calls
 *   processFile for every name ending in "lktext.xml". The same reader is
 *   handed to every call, so each call continues reading where the previous
 *   one stopped. Any exception prints a stack trace and exits with status 1.
 *   NOTE(review): File.list() returns null for a non-directory; that would
 *   surface here as an exception from Arrays.sort -- confirm this is intended.
 *
 * processFile(textFile, conll2008Input, outDir)
 *   - strips everything up to the last '/' from textFile;
 *   - reads one line that must start with "1\t___BEGIN___", otherwise throws
 *     a RuntimeException (caught by the method's own catch: stack trace and
 *     System.exit(1));
 *   - takes the token-file path from that line: the text after a prefix as
 *     long as "0\t___BEGIN___|", cut at the first tab;
 *   - opens the token file with FileReader (platform default charset, unlike
 *     the explicit UTF-8 used for the other streams);
 *   - derives BASENAME inside outDir by removing the directory part and a
 *     trailing ".SOMETHING.xml" from the token-file name;
 *   - reads and discards one more CoNLL line;
 *   - first pass: for each graph calls printDepGraph and printPreds, advances
 *     depIndex by nodes.length - 1 and remembers the triple; stops when the
 *     next graph has exactly two nodes and node 1's word starts with
 *     "___END___";
 *   - second pass: replays the remembered triples through printArgs, with
 *     depIndex restarted at 0 and paIdCounter carried over from the first pass.
 *   NOTE(review): inside the loop "t.first" is dereferenced right after
 *   readNextGraph without a null check, although the loop condition tests
 *   t != null -- looks like a NullPointerException if the input ends without
 *   an ___END___ marker; confirm what readNextGraph returns at end of input.
 *   NOTE(review): the streams are closed only on the success path.
 *
 * main_old(argv)
 *   Earlier single-document entry point: argv[0] = text file name,
 *   argv[1] = token file, argv[2] = CoNLL-2008 file. Output names are built
 *   by removing ".tokens.xml" from argv[1]. Reads the CoNLL file through a
 *   Scanner, and opens a second Scanner on the same file for the printArgs
 *   pass instead of buffering the graphs. tInput and the Scanners are not
 *   closed in the visible code.
 *
 * printDepGraph(out, dg, depIndexStart, ids)
 *   For nodes 1 .. nodes.length - 1: requires exactly one parent (otherwise
 *   IllegalArgumentException), looks up the child's token id at
 *   ids[depIndexStart + position - 1] and prints one line containing
 *   relations[0]. A parent at position 0 takes a separate branch that does
 *   not look up a parent token id.
 *
 * printPreds(out, pas, depIndexStart, ids, paIdCounter, predPosToId)
 *   For each structure: increments the counter, stores
 *   (depIndexStart + pred.position - 1) -> new id in predPosToId and prints
 *   one line containing the lemma. Ends with a blank line and returns the
 *   updated counter.
 *
 * printArgs(out, tokenFileName, pas, depIndexStart, ids, paIdCounter, predPosToId)
 *   For each argument of each structure: increments the counter and prints
 *   one line containing the argument label. predId is taken from "ids" (the
 *   predPosToId lookup is commented out). Ends with a blank line and returns
 *   the updated counter.
 *   NOTE(review): tokenFileName, predPosToId, predId, id and tokenId are not
 *   used in the visible text; presumably some of them appeared in the lost
 *   markup -- verify against the original.
 */
package lkformat2; import java.io.*; import java.util.*; import java.util.regex.*; import se.lth.cs.nlp.depsrl.format.*; import se.lth.cs.nlp.nlputils.depgraph.*; import se.lth.cs.nlp.nlputils.core.*; import gnu.trove.*; public class CoNLL2008ToLK3 { private static final String ENCODING = "UTF-8"; public static void main(String[] argv) { String lkDir = argv[0]; String conll2008File = argv[1]; String outDir = argv[2]; System.out.println("argv = " + Arrays.toString(argv)); try { //Scanner conll2008Input = new Scanner(new InputStreamReader(new FileInputStream(conll2008File), ENCODING)); BufferedReader conll2008Input = new BufferedReader(new InputStreamReader(new FileInputStream(conll2008File), ENCODING)); String[] files = new File(lkDir).list(); Arrays.sort(files); for(String file: files) { if(file.endsWith("lktext.xml")) processFile(lkDir + File.separatorChar + file, conll2008Input, outDir); } conll2008Input.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } static void processFile(String textFile, /*Scanner*/BufferedReader conll2008Input, String outDir) { try { textFile = textFile.replaceAll("[^/]*/", ""); System.out.println("textFile = " + textFile); //String line = conll2008Input.nextLine(); String line = conll2008Input.readLine(); if(line == null || !line.startsWith("1\t___BEGIN___")) { System.out.println("line = " + line); throw new RuntimeException("Expected beginning of file..."); } String tokenFile = line.substring("0\t___BEGIN___|".length()); tokenFile = tokenFile.replaceAll("\t.*", ""); System.out.println("Before: " + tokenFile); BufferedReader tInput = new BufferedReader(new FileReader(tokenFile)); tokenFile = tokenFile.replaceAll("[^/]*/", ""); String basename = outDir + File.separatorChar + tokenFile.replaceAll("\\.[^\\.]+\\.xml", ""); String depOutFile = basename + ".depsyntax.xml"; String paOutFile = basename + ".predargs.xml"; PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depOutFile), 
ENCODING)); PrintWriter paOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(paOutFile), ENCODING)); //conll2008Input.nextLine(); conll2008Input.readLine(); System.out.println(tokenFile); TIntArrayList ids = new TIntArrayList(); Pattern pattern = Pattern.compile("id=\"(.*?)\""); line = tInput.readLine(); boolean started = false; while(line != null) { line = line.trim(); if(line.equals("")) { // skip } if(!started && line.startsWith(""); depOut.println(""); depOut.println(""); depOut.println(" " + textFile + ""); depOut.println(" LTH-DEP-SRL"); depOut.println(""); depOut.println(""); paOut.println(""); paOut.println(""); paOut.println(""); paOut.println(" " + textFile + ""); paOut.println(" LTH-DEP-SRL"); paOut.println(""); paOut.println(""); int depIndex = 0; TIntIntHashMap predPosToId = new TIntIntHashMap(); int paIdCounter = 0; ArrayList>> triples = new ArrayList(); Triple> t = CoNLL2008Format.readNextGraph(conll2008Input); while(t != null) { DepGraph dg = t.first; //System.out.println("1."); //CoNLL2008Format.printPAStructures(t.first, t.second, t.third); printDepGraph(depOut, dg, depIndex, ids); paIdCounter = printPreds(paOut, t.third, depIndex, ids, paIdCounter, predPosToId); depIndex += dg.nodes.length - 1; depOut.println(); triples.add(t); t = CoNLL2008Format.readNextGraph(conll2008Input); //System.out.println("2."); //CoNLL2008Format.printPAStructures(t.first, t.second, t.third); /*if(t.second.nodes.length == 2) { CoNLL2008Format.printPAStructures(t.first, t.second, t.third); }*/ if(t.first.nodes.length == 2 && t.first.nodes[1].word.startsWith("___END___")) { //System.out.println("Found end."); break; } } depOut.println(""); depOut.println(""); depOut.close(); paOut.println(""); paOut.println(""); depIndex = 0; for(Triple> tr: triples) { paIdCounter = printArgs(paOut, tokenFile, tr.third, depIndex, ids, paIdCounter, predPosToId); depIndex += tr.first.nodes.length - 1; } paOut.println(""); paOut.println(""); paOut.close(); tInput.close(); } 
catch(Exception e) { e.printStackTrace(); System.exit(1); } } public static void main_old(String[] argv) { String lkTextFile = argv[0]; String lkTokenFile = argv[1]; String conll2008File = argv[2]; String basename = lkTokenFile.replaceAll("\\.tokens\\.xml", ""); String depOutFile = basename + ".depsyntax.xml"; String paOutFile = basename + ".predargs.xml"; try { BufferedReader tInput = new BufferedReader(new FileReader(lkTokenFile)); Scanner conllInput = new Scanner(new FileReader(conll2008File)); PrintWriter depOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(depOutFile), ENCODING)); PrintWriter paOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(paOutFile), ENCODING)); TIntArrayList ids = new TIntArrayList(); Pattern pattern = Pattern.compile("id=\"(.*?)\""); String line = tInput.readLine(); boolean started = false; while(line != null) { line = line.trim(); if(line.equals("")) { // skip } if(!started && line.startsWith(""); depOut.println(""); depOut.println(""); depOut.println(" " + lkTextFile + ""); depOut.println(" LTH-DEP-SRL"); depOut.println(""); depOut.println(""); paOut.println(""); paOut.println(""); paOut.println(""); paOut.println(" " + lkTextFile + ""); paOut.println(" LTH-DEP-SRL"); paOut.println(""); paOut.println(""); int depIndex = 0; TIntIntHashMap predPosToId = new TIntIntHashMap(); int paIdCounter = 0; Triple> t = CoNLL2008Format.readNextGraph(conllInput); while(t != null) { DepGraph dg = t.first; printDepGraph(depOut, dg, depIndex, ids); paIdCounter = printPreds(paOut, t.third, depIndex, ids, paIdCounter, predPosToId); depIndex += dg.nodes.length - 1; depOut.println(); t = CoNLL2008Format.readNextGraph(conllInput); } depOut.println(""); depOut.println(""); depOut.close(); paOut.println(""); paOut.println(""); conllInput = new Scanner(new FileReader(conll2008File)); depIndex = 0; t = CoNLL2008Format.readNextGraph(conllInput); while(t != null) { paIdCounter = printArgs(paOut, lkTokenFile, t.third, depIndex, ids, 
paIdCounter, predPosToId); depIndex += t.first.nodes.length - 1; t = CoNLL2008Format.readNextGraph(conllInput); } paOut.println(""); paOut.println(""); paOut.close(); } catch(Exception e) { e.printStackTrace(); System.exit(1); } } private static void printDepGraph(PrintWriter out, DepGraph dg, int depIndexStart, TIntArrayList ids) { for(int i = 1; i < dg.nodes.length; i++) { DepNode n = dg.nodes[i]; if(n.parents.length != 1) throw new IllegalArgumentException("Only single-head dependency trees allowed yet"); DepNode p = dg.nodes[i].parents[0]; int childIndex = depIndexStart + n.position - 1; int childTokenId = ids.get(childIndex); if(p.position == 0) { out.println(" " + n.relations[0] + ""); } else { int parentIndex = depIndexStart + p.position - 1; int parentTokenId = ids.get(parentIndex); out.println(" " + n.relations[0] + ""); } } } private static int printPreds(PrintWriter out, List pas, int depIndexStart, TIntArrayList ids, int paIdCounter, TIntIntHashMap predPosToId) { for(PAStructure pa: pas) { int id = ++paIdCounter; int predPosAbsolute = depIndexStart + pa.pred.position - 1; int tokenId = ids.get(predPosAbsolute); predPosToId.put(predPosAbsolute, id); out.println(" " + pa.lemma + ""); } out.println(); return paIdCounter; } private static int printArgs(PrintWriter out, String tokenFileName, List pas, int depIndexStart, TIntArrayList ids, int paIdCounter, TIntIntHashMap predPosToId) { for(PAStructure pa: pas) { int predPosAbsolute = depIndexStart + pa.pred.position - 1; //int predId = predPosToId.get(predPosAbsolute); int predId = ids.get(predPosAbsolute); for(int i = 0; i < pa.args.size(); i++) { int id = ++paIdCounter; DepNode arg = pa.args.get(i); String argLabel = pa.argLabels.get(i); int argPosAbsolute = depIndexStart + arg.position - 1; int tokenId = ids.get(argPosAbsolute); //out.println(" " // + argLabel + ""); out.println(" " + argLabel + ""); } } out.println(); return paIdCounter; } }