package lkformat2;
import java.io.*;
import java.util.*;
/**
 * Converts stand-off annotation files (sentences, tokens, part-of-speech tags and
 * lemmas) into tab-separated rows with ten columns, one row per token and a blank
 * line after every sentence.
 *
 * <p>The input is read line by line, not with an XML parser: every entity is
 * expected to sit on a line of its own, and an annotation section is located by a
 * line containing {@code provides="SENTENCES"}, {@code provides="TOKENS"},
 * {@code provides="POS"} or {@code provides="LEMMA"}.
 *
 * <p>NOTE(review): the markup literals this class originally searched for were lost
 * (they had become empty strings, which made every section loop a no-op). The
 * element names of the format are not known from this file, so matching is done
 * structurally: entity text is whatever lies between the first tag and the last
 * {@code </}, and a section ends at the first line that starts with {@code </}.
 * Confirm against real data.
 */
public class PreprocessParser {

    /** Character encoding used for all input files and for the output file. */
    private static final String ENCODING = "UTF-8";

    /** Slots of the per-token record built while reading the annotation files. */
    private static final int SLOT_ID = 0;
    private static final int SLOT_FORM = 1;
    private static final int SLOT_POS = 2;
    private static final int SLOT_LEMMA = 3;

    /**
     * Returns the value of {@code attr="..."} on the given line.
     *
     * <p>The search is a plain substring search for {@code attr="}, so an attribute
     * whose name merely ends in {@code attr} matches as well.
     *
     * @param line the text to search
     * @param attr the attribute name
     * @return the text between the quotes, or {@code null} when the attribute is
     *         absent or its closing quote is missing
     */
    private static String extractAttribute(String line, String attr) {
        String prefix = attr + "=\"";
        int valueStart = line.indexOf(prefix);
        if (valueStart == -1) {
            return null;
        }
        valueStart += prefix.length();
        int valueEnd = line.indexOf('"', valueStart);
        if (valueEnd == -1) {
            // Unterminated value: report "not found" instead of failing in substring().
            return null;
        }
        return line.substring(valueStart, valueEnd);
    }

    /**
     * Returns the text content of the entity on the given line, i.e. everything
     * between the end of the first tag and the start of the last closing tag.
     *
     * @param line a line holding one entity
     * @return the enclosed text (possibly empty), or {@code null} when the line has
     *         no opening tag followed by a closing tag
     */
    private static String extractEntityData(String line) {
        int tagStart = line.indexOf('<');
        if (tagStart == -1) {
            return null;
        }
        int tagEnd = line.indexOf('>', tagStart);
        if (tagEnd == -1) {
            return null;
        }
        int closeStart = line.lastIndexOf("</");
        if (closeStart == -1 || closeStart < tagEnd + 1) {
            return null;
        }
        return line.substring(tagEnd + 1, closeStart);
    }

    /**
     * Tells whether the line closes the current annotation section, which is taken
     * to be any line whose first non-blank characters are {@code </}.
     */
    private static boolean isSectionEnd(String line) {
        return line.trim().startsWith("</");
    }

    /** Opens a file for buffered, UTF-8 decoded reading. */
    private static BufferedReader openReader(String fileName) throws IOException {
        return new BufferedReader(new InputStreamReader(new FileInputStream(fileName), ENCODING));
    }

    /**
     * Advances to the first line containing {@code marker}, starting with
     * {@code current} itself.
     *
     * @return the matching line, or {@code null} when the end of input is reached
     */
    private static String seekMarker(BufferedReader br, String current, String marker) throws IOException {
        String line = current;
        while (line != null && !line.contains(marker)) {
            line = br.readLine();
        }
        return line;
    }

    /**
     * Reads the next line of a section and fails with a clear message, rather than
     * a NullPointerException, when the input ends before the section is closed.
     */
    private static String requireLine(BufferedReader br, String section) throws IOException {
        String line = br.readLine();
        if (line == null) {
            throw new RuntimeException("Unexpected end of file inside " + section + " annotation");
        }
        return line;
    }

    /**
     * Reads the sentence spans that follow the SENTENCES header line.
     *
     * @return one {@code {firstTokenId, lastTokenId}} pair per sentence, with the
     *         leading {@code #} of the references removed
     */
    private static List<String[]> readSentenceSpans(BufferedReader br) throws IOException {
        List<String[]> spans = new ArrayList<String[]>();
        String line = requireLine(br, "SENTENCES");
        while (!isSectionEnd(line)) {
            // Blank lines are skipped here as they are in the other sections.
            if (!line.trim().equals("")) {
                String start = extractAttribute(line, "start");
                String end = extractAttribute(line, "end");
                if (start == null || end == null) {
                    throw new RuntimeException("Only start-end annotation supported for sentences");
                }
                if (!start.startsWith("#") || !end.startsWith("#")) {
                    throw new RuntimeException("Only relative URIs supported for sentences: line = "
                            + line + " start = " + start + " end = " + end);
                }
                spans.add(new String[] { start.substring(1), end.substring(1) });
            }
            line = requireLine(br, "SENTENCES");
        }
        return spans;
    }

    /**
     * Reads the tokens that fall inside the given sentence spans.
     *
     * <p>Token ids must be the consecutive integers 1, 2, 3, ... Reading stops as
     * soon as the last sentence has been completed.
     *
     * @return one record {@code {id, form, null, null}} per token inside a sentence
     */
    private static List<String[]> readTokens(String tokenFile, List<String[]> spans) throws IOException {
        BufferedReader br = openReader(tokenFile);
        try {
            if (seekMarker(br, br.readLine(), "provides=\"TOKENS\"") == null) {
                throw new RuntimeException("No TOKENS annotation found!");
            }
            List<String[]> tokens = new ArrayList<String[]>();
            int senPos = 0;
            String[] senSpan = spans.get(senPos);
            boolean inside = false;
            int prev = 0;
            String line = requireLine(br, "TOKENS");
            while (!isSectionEnd(line)) {
                line = line.trim();
                if (!line.equals("")) {
                    String form = extractEntityData(line);
                    if (form == null) {
                        throw new RuntimeException("Could not extract token");
                    }
                    String id = extractAttribute(line, "id");
                    if (id == null) {
                        throw new RuntimeException("Could not extract id");
                    }
                    int idi = Integer.parseInt(id);
                    if (idi != prev + 1) {
                        throw new RuntimeException("I have assumed contiguous ids...");
                    }
                    prev = idi;
                    if (id.equals(senSpan[0])) {
                        inside = true;
                    }
                    if (inside) {
                        tokens.add(new String[] { id, form, null, null });
                    }
                    if (id.equals(senSpan[1])) {
                        senPos++;
                        if (senPos == spans.size()) {
                            break;
                        }
                        senSpan = spans.get(senPos);
                        inside = false;
                    }
                }
                line = requireLine(br, "TOKENS");
            }
            return tokens;
        } finally {
            br.close();
        }
    }

    /**
     * Reads one annotation section whose entities point at tokens through an
     * {@code on="#id"} attribute and stores each entity's text in the given slot of
     * the referenced token record.
     *
     * <p>The token with id {@code n} is looked up at list index {@code n - 1}, so
     * this only works when every token from id 1 onwards belongs to a sentence; any
     * other reference is rejected.
     *
     * @return the line that closed the section
     */
    private static String applyAnnotation(BufferedReader br, List<String[]> tokens, int slot, String section)
            throws IOException {
        String line = requireLine(br, section);
        while (!isSectionEnd(line)) {
            String entity = line.trim();
            if (!entity.equals("")) {
                String value = extractEntityData(entity);
                if (value == null) {
                    throw new RuntimeException("Could not extract token");
                }
                String on = extractAttribute(entity, "on");
                if (on == null) {
                    throw new RuntimeException("Only on annotation supported for sentences");
                }
                if (!on.startsWith("#")) {
                    throw new RuntimeException("Only relative URIs supported for sentences: line = "
                            + entity + " on = " + on);
                }
                on = on.substring(1);
                int position = Integer.parseInt(on) - 1;
                if (position < 0 || position >= tokens.size()) {
                    throw new RuntimeException(section + " annotation refers to token #" + on
                            + ", which is not part of any sentence");
                }
                String[] ts = tokens.get(position);
                if (!on.equals(ts[SLOT_ID])) {
                    throw new RuntimeException(section + " annotation on #" + on
                            + " does not line up with token id " + ts[SLOT_ID]);
                }
                ts[slot] = value;
            }
            line = requireLine(br, section);
        }
        return line;
    }

    /**
     * Writes the pseudo-sentence that marks the start of a document.
     *
     * <p>NOTE(review): the second copy of the marker is in column 7 here, whereas
     * the end marker and ordinary tokens repeat the form in column 6. Kept as it
     * was; confirm whether a consumer relies on it.
     */
    private static void writeBeginRow(PrintWriter out, String tokenFile) {
        out.print("0\t___BEGIN___|" + tokenFile + "\t");
        out.print("_\t");
        out.print("_\t");
        out.print("_\t");
        out.print("_\t");
        out.print("___BEGIN___|" + tokenFile + "\t");
        out.print("_\t");
        out.print("0\t");
        out.println("ROOT");
        out.println();
    }

    /**
     * Writes one row per token (index in sentence, form, lemma, {@code _}, POS,
     * form, lemma, POS, {@code 0}, {@code ROOT}) and a blank line after the last
     * token of each sentence. A token without POS or lemma gets the text
     * {@code null} in those columns.
     */
    private static void writeSentences(PrintWriter out, List<String[]> tokens, List<String[]> spans) {
        int senPos = 0;
        String[] senSpan = spans.get(senPos);
        int posInSentence = 0;
        for (String[] ts : tokens) {
            posInSentence++;
            out.print(posInSentence + "\t");
            out.print(ts[SLOT_FORM] + "\t");
            out.print(ts[SLOT_LEMMA] + "\t");
            out.print("_\t");
            out.print(ts[SLOT_POS] + "\t");
            out.print(ts[SLOT_FORM] + "\t");
            out.print(ts[SLOT_LEMMA] + "\t");
            out.print(ts[SLOT_POS] + "\t");
            out.print("0\t");
            out.println("ROOT");
            // Exact match, the same test that decided sentence ends while reading tokens.
            if (ts[SLOT_ID].equals(senSpan[1])) {
                out.println();
                senPos++;
                if (senPos == spans.size()) {
                    break;
                }
                senSpan = spans.get(senPos);
                posInSentence = 0;
            }
        }
    }

    /** Writes the pseudo-sentence that marks the end of a document. */
    private static void writeEndRow(PrintWriter out) {
        out.print("1\t___END___\t");
        out.print("_\t");
        out.print("_\t");
        out.print("_\t");
        out.print("___END___\t");
        out.print("_\t");
        out.print("_\t");
        out.print("0\t");
        out.println("ROOT");
        out.println();
    }

    /**
     * Converts one sentence-annotation file and appends the result to {@code out}.
     *
     * <p>Nothing is written when {@code fileName} is a directory, has no SENTENCES
     * annotation, or lists no sentences. Tokens are read from the file named by the
     * {@code scope} attribute of the SENTENCES header (or from {@code fileName}
     * itself when there is none); POS tags and lemmas are read from the token file
     * name with its last two extensions replaced by {@code .pos.xml}.
     *
     * <p>Any failure prints a stack trace and terminates the JVM with status 1.
     *
     * @param fileName path of the file holding the sentence annotation
     * @param out      destination of the converted rows; not closed by this method
     */
    public static void processFile(String fileName, PrintWriter out) {
        try {
            if (new File(fileName).isDirectory()) {
                return;
            }

            String tokenFile;
            List<String[]> spans;
            BufferedReader br = openReader(fileName);
            try {
                String header = seekMarker(br, br.readLine(), "provides=\"SENTENCES\"");
                if (header == null) {
                    return;
                }
                tokenFile = extractAttribute(header, "scope");
                if (tokenFile == null) {
                    tokenFile = fileName;
                }
                spans = readSentenceSpans(br);
            } finally {
                br.close();
            }
            if (spans.isEmpty()) {
                return;
            }

            List<String[]> tokens = readTokens(tokenFile, spans);

            String posFile = tokenFile.replaceAll("\\.[^\\.]+\\.xml", ".pos.xml");
            br = openReader(posFile);
            try {
                // Written only once the POS file could be opened, as before.
                writeBeginRow(out, tokenFile);
                if (seekMarker(br, br.readLine(), "provides=\"POS\"") == null) {
                    throw new RuntimeException("No POS annotation found!");
                }
                String sectionEnd = applyAnnotation(br, tokens, SLOT_POS, "POS");
                // The search resumes on the line that closed the POS section.
                if (seekMarker(br, sectionEnd, "provides=\"LEMMA\"") == null) {
                    throw new RuntimeException("No LEMMA annotation found!");
                }
                applyAnnotation(br, tokens, SLOT_LEMMA, "LEMMA");
            } finally {
                br.close();
            }

            writeSentences(out, tokens, spans);
            writeEndRow(out);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Converts every file directly inside {@code dirName}, in sorted name order,
     * into the single UTF-8 output file {@code outFileName}.
     *
     * <p>Any failure prints a stack trace and terminates the JVM with status 1.
     *
     * @param dirName     directory holding the annotation files
     * @param outFileName file to create or overwrite with the converted rows
     */
    public static void processDirectory(String dirName, String outFileName) {
        try {
            // Output is encoded like the input instead of in the platform default charset.
            PrintWriter out = new PrintWriter(new BufferedWriter(
                    new OutputStreamWriter(new FileOutputStream(outFileName), ENCODING)));
            try {
                String[] files = new File(dirName).list();
                if (files == null) {
                    throw new IOException("Not a readable directory: " + dirName);
                }
                Arrays.sort(files);
                for (String file : files) {
                    processFile(dirName + File.separatorChar + file, out);
                }
                // PrintWriter swallows I/O errors; surface them here.
                if (out.checkError()) {
                    throw new IOException("Error while writing " + outFileName);
                }
            } finally {
                out.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Command-line entry point.
     *
     * @param argv {@code argv[0]} is the input directory, {@code argv[1]} the output file
     */
    public static void main(String[] argv) {
        if (argv.length < 2) {
            System.err.println("Usage: PreprocessParser <inputDir> <outputFile>");
            System.exit(1);
        }
        processDirectory(argv[0], argv[1]);
    }
}