package lkformat;
import java.io.*;
import java.util.*;
//import java.util.regex.*;
public class SSTToLK {
private static final String ENCODING = "UTF-8";
/**
 * Command-line entry point: aligns tagger output with an LK text file and writes
 * the token, POS/lemma, supersense/NE and tabular CoNLL-2008 input files.
 *
 * <p>{@code argv[0]} is the LK text file. {@code argv[1]} is the tagger output,
 * read one sentence per non-blank line; every token contributes six
 * space-separated fields: form, POS, lemma and three IOB tags (emitted below as
 * the "WNSS", "NE-CONLL03" and "NE-WSJ" layers). Output file names are derived
 * from {@code argv[0]} by removing ".lktext.xml" and appending ".tokens.xml",
 * ".pos.xml", ".sst.xml" and ".conll08in".
 *
 * <p>Any exception raised while processing is printed and the JVM exits with
 * status 1.
 *
 * <p>NOTE(review): many string literals in this method are empty or a single
 * space, and {@code endString} is not declared anywhere in the visible file.
 * This looks like markup that was lost from the source text; the literals are
 * reproduced unchanged here and should be restored from the original.
 *
 * @param argv LK text file name followed by the tagger output file name
 */
public static void main(String[] argv) {
    if (argv.length < 2) {
        System.err.println("Usage: SSTToLK <lktext-file> <sst-output-file>");
        System.exit(1);
    }
    String lkTextFile = argv[0];
    String sstOutputFile = argv[1];
    String basename = lkTextFile.replaceAll("\\.lktext\\.xml", "");
    String outTokenFile = basename + ".tokens.xml";
    String outPOSFile = basename + ".pos.xml";
    String outSSTFile = basename + ".sst.xml";
    String outCoNLL2008File = basename + ".conll08in";
    try {
        String lkText = readLKText(lkTextFile);
        // NOTE(review): FileReader decodes with the platform default charset while
        // the outputs use ENCODING. It is left as is so that token text stays
        // comparable with what readLKText returns; change both readers together.
        BufferedReader br = new BufferedReader(new FileReader(sstOutputFile));
        PrintWriter tokenOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outTokenFile), ENCODING));
        PrintWriter posOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING));
        PrintWriter sstOut = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outSSTFile), ENCODING));
        PrintWriter tabularCoNLL08Out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outCoNLL2008File), ENCODING));

        // Header of the token file.
        tokenOut.println("");
        tokenOut.println("");
        tokenOut.println("");
        tokenOut.println(" " + lkTextFile + "");
        tokenOut.println(" SSTLight");
        tokenOut.println("");
        tokenOut.println("");

        // Header of the POS file.
        posOut.println("");
        posOut.println("");
        posOut.println("");
        posOut.println(" " + lkTextFile + "");
        posOut.println(" SSTLight");
        posOut.println("");
        posOut.println("");

        // Declared as ArrayList (not List) so they can still be passed to helpers
        // that take ArrayList parameters.
        ArrayList<Integer> sentenceEnds = new ArrayList<Integer>();
        ArrayList<String> lemmas = new ArrayList<String>();
        ArrayList<String> ssIOB = new ArrayList<String>();
        ArrayList<String> conll03IOB = new ArrayList<String>();
        ArrayList<String> wsjIOB = new ArrayList<String>();

        int tokenIdCounter = 0; // token ids are 1-based and run across sentences
        int position = 0;       // offset in lkText where the next token must start

        String line = br.readLine();
        while (line != null) {
            line = line.trim();
            if (!line.equals("")) {
                String[] ss = line.split(" ");
                if (ss.length % 6 != 0)
                    throw new RuntimeException("this line: |" + line + "|");
                int n = ss.length / 6;
                for (int i = 0; i < n; i++) {
                    String token = ss[6*i];
                    String pos = ss[6*i + 1];
                    String lemma = ss[6*i + 2];
                    lemmas.add(lemma);

                    // One tab-separated row per token; the head is fixed to 0 / ROOT.
                    tabularCoNLL08Out.print((i + 1) + "\t");
                    tabularCoNLL08Out.print(token + "\t");
                    tabularCoNLL08Out.print(lemma + "\t");
                    tabularCoNLL08Out.print("_\t");
                    tabularCoNLL08Out.print(pos + "\t");
                    tabularCoNLL08Out.print(token + "\t");
                    tabularCoNLL08Out.print(lemma + "\t");
                    tabularCoNLL08Out.print(pos + "\t");
                    tabularCoNLL08Out.print("0\t");
                    tabularCoNLL08Out.println("ROOT");

                    // A sentence-initial "I-" tag is rewritten to "B-" so that a span
                    // never continues across a sentence boundary.
                    if (i == 0)
                        for (int j = 3; j < 6; j++)
                            if (ss[6*i + j].startsWith("I-"))
                                ss[6*i + j] = "B-" + ss[6*i + j].substring(2);
                    ssIOB.add(ss[6*i + 3]);
                    conll03IOB.add(ss[6*i + 4]);
                    wsjIOB.add(ss[6*i + 5]);

                    int id = ++tokenIdCounter;
                    int start = position;
                    int end = getEnd(lkText, token, position, line);
                    printEntity(token, start, end - 1, id, tokenOut);
                    printEntity(pos, id, -1, id, posOut);
                    position = getNext(lkText, end);
                }
                sentenceEnds.add(tokenIdCounter);
                tabularCoNLL08Out.println(); // blank line terminates the sentence
            }
            line = br.readLine();
        }
        br.close();

        int nTokens = tokenIdCounter;

        tokenOut.println("");
        tokenOut.println("");
        // NOTE(review): prev is updated but never read and every sentence prints a
        // single space; the sentence element text appears to have been lost.
        int prev = 1;
        for (Integer i : sentenceEnds) {
            StringBuilder sb = new StringBuilder(" ");
            tokenOut.println(sb);
            prev = i + 1;
        }
        tokenOut.println("");
        tokenOut.println("");
        tokenOut.close();

        posOut.println("");
        posOut.println("");
        // Lemma entries refer to token i + 1 and take ids following the token ids.
        for (int i = 0; i < lemmas.size(); i++) {
            String lemma = lemmas.get(i);
            int tid = i + 1;
            int id = tid + nTokens;
            printEntity(lemma, tid, -1, id, posOut);
        }
        posOut.println("");
        posOut.println("");
        posOut.close();

        sstOut.println("");
        sstOut.println("");
        sstOut.println("");
        sstOut.println(" " + lkTextFile + "");
        sstOut.println(" SSTLight");
        sstOut.println("");
        String preamble = "";
        // The span id counter is threaded through the three layers so that ids
        // stay unique within the file.
        int ssid = 0;
        ssid = printIOB(sstOut, preamble + "WNSS" + endString, ssid, nTokens, ssIOB);
        ssid = printIOB(sstOut, preamble + "NE-CONLL03" + endString, ssid, nTokens, conll03IOB);
        ssid = printIOB(sstOut, preamble + "NE-WSJ" + endString, ssid, nTokens, wsjIOB);
        sstOut.println("");
        sstOut.close();
        tabularCoNLL08Out.close();
    } catch (Exception e) {
        e.printStackTrace();
        System.exit(1);
    }
}
// NOTE(review): this region is damaged. It starts as readLKText(String), which
// opens the file with FileReader (platform default charset) and reads it line by
// line into a StringBuilder, but the body breaks off inside its loop and runs
// straight into the tail of a different method. The conditions below are not
// valid Java as they stand (L138 and L142 of the original listing); the text
// between them appears to have been lost and must be restored from the original
// source rather than guessed.
private static String readLKText(String file) throws IOException {
BufferedReader br = new BufferedReader(new FileReader(file));
StringBuilder sb = new StringBuilder();
String line = br.readLine();
boolean started = false;
while(line != null) {
if(!started) {
if(line.startsWith("".length()));
sb.append("\n");
}
} else {
if(line.startsWith("");
else
// NOTE(review): from here on the names start, l and out are used without any
// visible declaration. This is presumably the end of printEntity, which callers
// invoke as printEntity(String, int, int, int, PrintWriter) -- confirm against
// the original file.
sb.append("\" on=\"#" + start + "\">");
sb.append(l);
sb.append("");
out.println(sb);
}
/**
 * Skips whitespace in {@code lkText} starting at {@code position}.
 *
 * @param lkText   text being scanned
 * @param position offset at which scanning starts
 * @return the offset of the first character at or after {@code position} for
 *         which {@link Character#isWhitespace(char)} is false, or
 *         {@code position} itself when it is already at or past the end of the
 *         text
 */
private static int getNext(String lkText, int position) {
    final int limit = lkText.length();
    int cursor = position;
    for (; cursor < limit; cursor++) {
        if (!Character.isWhitespace(lkText.charAt(cursor))) {
            break;
        }
    }
    return cursor;
}
/**
 * Checks that {@code token} occurs in {@code lkText} at {@code position} and
 * returns the offset just past it.
 *
 * <p>The tokens {@code ``} and {@code ''} are also accepted when the text has a
 * plain double quote at that position; in that case only one character is
 * consumed.
 *
 * @param lkText   text the tokens are aligned against
 * @param token    token expected at {@code position}
 * @param position offset in {@code lkText} where the token should start
 * @param line     tagger line the token came from; printed only on a mismatch
 * @return offset of the first character after the matched token
 * @throws RuntimeException if the text at {@code position} does not match the
 *         token; the line and the result of {@code getContext} (defined outside
 *         this block) are printed to standard output first
 */
private static int getEnd(String lkText, String token, int position,
        String line) {
    // Never read past the end of the text: compare at most the remaining chars.
    int remaining = lkText.length() - position;
    int span = remaining < token.length() ? remaining : token.length();
    String found = lkText.substring(position, position + span);

    if (token.equals(found)) {
        return position + span;
    }

    // Directional quote tokens stand for a single '"' in the text.
    if (token.matches("``|''") && found.startsWith("\"")) {
        return position + 1;
    }

    System.out.println(line);
    System.out.println(getContext(lkText, position));
    throw new RuntimeException(token + " != " + found);
}
/**
 * Writes one layer of IOB-tagged spans to {@code sstOut}.
 *
 * <p>{@code preamble} is printed first, then one entity per span via
 * {@code printEntity(label, firstToken, lastToken, id, sstOut)}, then a closing
 * line. Token numbers are 1-based positions in {@code iob}. A span starts at any
 * tag other than "0" when no span is open, and ends before a "0" tag, before a
 * "B-" tag, or before an "I-" tag whose label differs from the open span (that
 * tag then opens a new span).
 *
 * <p>NOTE(review): the closing line is an empty string literal; its text
 * appears to have been lost from the source and is reproduced unchanged.
 *
 * @param sstOut   destination writer
 * @param preamble line printed before the spans of this layer
 * @param ssid     last span id already used; new spans are numbered from
 *                 {@code ssid + 1}
 * @param nTokens  token number used as the end of a span still open after the
 *                 last tag
 * @param iob      one tag per token: "0", or "B-"/"I-" followed by a label
 * @return the last span id used, for passing to the next layer
 * @throws RuntimeException if a tag is neither "0" nor prefixed with "B-" or "I-"
 */
private static int printIOB(PrintWriter sstOut, String preamble,
        int ssid, int nTokens, List<String> iob) {
    sstOut.println(preamble);
    String openTag = null;
    int openTagStart = -1;
    for (int i = 0; i < iob.size(); i++) {
        String tag = iob.get(i);
        boolean outside = tag.equals("0");
        if (!outside && !tag.startsWith("B-") && !tag.startsWith("I-"))
            throw new RuntimeException("Illegal tag " + tag);
        int tid = i + 1;
        String label = outside ? null : tag.substring(2);

        // Close the running span when this token cannot continue it.
        if (openTag != null
                && (outside || tag.startsWith("B-") || !label.equals(openTag))) {
            printEntity(openTag, openTagStart, tid - 1, ++ssid, sstOut);
            openTag = null;
        }

        // Any non-outside tag opens a span when none is running. If one is still
        // running here, the tag is an "I-" with the same label, so nothing changes.
        if (!outside && openTag == null) {
            openTag = label;
            openTagStart = tid;
        }
    }
    // Flush a span that runs to the end of the tag list.
    if (openTag != null) {
        printEntity(openTag, openTagStart, nTokens, ++ssid, sstOut);
    }
    sstOut.println("");
    return ssid;
}
}