package lkformat2;

import java.io.*;
import java.util.*;

/**
 * Converts the output of a POS tagger into two artefacts: a single
 * tab-separated, ten-column file covering all documents, and one
 * {@code .pos.xml} file per document.
 *
 * <p>The tagger output is read as a stream of document sections. Each section
 * starts with a line beginning with {@code ___BEGIN___|}, is followed by
 * {@code token TAB tag} lines (a blank line resets the per-sentence token
 * counter), and ends with a line containing {@code ___END___} followed by one
 * empty line.
 *
 * <p>NOTE(review): several string literals in this file are empty or look
 * truncated (see {@link #processFile} and {@link #printEntity}). They appear
 * to have lost their XML markup before this file reached review. They are
 * reproduced unchanged here; restore the original markup from version control
 * before relying on the generated {@code .pos.xml} files.
 */
public class POSTagsToLK {

    private static final String ENCODING = "UTF-8";

    /**
     * Tags for which a missing dictionary entry yields no lemma at all rather
     * than the lower-cased word form.
     */
    private static final Set<String> DICTIONARY_ONLY_TAGS = new HashSet<String>(
            Arrays.asList("JJR", "JJS", "NNS", "VBD", "VBG", "VBN", "VBP", "VBZ"));

    /** Lemma dictionary keyed by {@code word TAB tag}; filled by {@link #readDict}. */
    private static Map<String, String> dict;

    /**
     * Entry point.
     *
     * @param argv {@code argv[0]} directory scanned for files ending in
     *             {@code lktext.xml}; {@code argv[1]} tagger output file;
     *             {@code argv[2]} tabular output file; {@code argv[3]} output
     *             directory for the {@code .pos.xml} files; {@code argv[4]}
     *             lemma dictionary file
     */
    public static void main(String[] argv) {
        if (argv.length < 5) {
            System.err.println("Usage: POSTagsToLK <lkDir> <sstOutputFile> <conll2008OutFile> <outDir> <dict>");
            System.exit(1);
        }

        String lkDir = argv[0];
        String sstOutputFile = argv[1];
        String conll2008OutFile = argv[2];
        String outDir = argv[3];
        String dict = argv[4];

        try {
            readDict(dict);

            BufferedReader taggerInput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(sstOutputFile), ENCODING));
            try {
                PrintWriter conll2008Out = new PrintWriter(
                        new OutputStreamWriter(new FileOutputStream(conll2008OutFile), ENCODING));
                try {
                    // File.list() returns null when the path is not a readable directory.
                    String[] files = new File(lkDir).list();
                    if (files == null)
                        throw new RuntimeException("Cannot list directory |" + lkDir + "|");

                    // Sorted so that files are consumed in a deterministic order,
                    // since all of them read from the same tagger stream.
                    Arrays.sort(files);
                    for (String file : files) {
                        if (file.endsWith("lktext.xml"))
                            processFile(lkDir + File.separatorChar + file, taggerInput, conll2008Out, outDir);
                    }
                } finally {
                    conll2008Out.close();
                }
            } finally {
                taggerInput.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Consumes one document section from the tagger stream, appends its rows
     * to the tabular output and writes the document's {@code .pos.xml} file.
     *
     * <p>Any failure is reported with a stack trace and terminates the JVM
     * with exit status 1; this method never propagates an exception.
     *
     * @param textFile          path of the base text file; everything up to the
     *                          last {@code /} is dropped before use
     * @param taggerInput       tagger output, positioned at a
     *                          {@code ___BEGIN___|} line
     * @param tabularCoNLL08Out destination of the ten-column rows
     * @param outDir            directory receiving the {@code .pos.xml} file
     */
    public static void processFile(String textFile, BufferedReader taggerInput,
                                   PrintWriter tabularCoNLL08Out, String outDir) {
        try {
            textFile = textFile.replaceAll("[^/]*/", "");
            System.out.println("Reading tags for base text |" + textFile + "|");

            String line = taggerInput.readLine();
            if (line == null)
                throw new RuntimeException("Expected beginning of file, but came to end");
            if (!line.startsWith("___BEGIN___"))
                throw new RuntimeException("Expected beginning of file: line = |" + line + "|");

            String tokenFile = line.substring("___BEGIN___|".length());
            tokenFile = tokenFile.replaceAll("\t.*", "");
            System.out.println("Token file is |" + tokenFile + "|");

            String beginMarker = "___BEGIN___|" + tokenFile;
            printRow(tabularCoNLL08Out,
                    "1", beginMarker, "_", "_", "_", "_", beginMarker, "_", "0", "ROOT");

            // From here on only the file name part is needed (for the end-tag check).
            tokenFile = tokenFile.replaceAll("[^/\\\\]*[/\\\\]", "");

            String basename = textFile.replaceFirst("\\.lktext\\.xml", "");
            basename = basename.replaceAll("[^/\\\\]*[/\\\\]", "");
            basename = outDir + File.separatorChar + basename;
            String outPOSFile = basename + ".pos.xml";

            PrintWriter posOut = new PrintWriter(
                    new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING));
            try {
                // NOTE(review): header literals appear to have lost their markup; kept as found.
                posOut.println("");
                posOut.println("");
                posOut.println("");
                posOut.println(" " + textFile + "");
                posOut.println(" LTHPOSTagger");
                posOut.println("");
                posOut.println("");

                List<String> lemmas = new ArrayList<String>();
                int tokenIdCounter = 0;
                int sentenceTokenId = 0;

                line = taggerInput.readLine();
                while (line != null && !line.contains("___END___")) {
                    line = line.trim();
                    if (!line.equals("")) {
                        String[] tokenAndTag = parseTaggerLine(line);
                        String token = tokenAndTag[0];
                        String pos = tokenAndTag[1];

                        // May be null; the tabular row then carries the text "null".
                        // NOTE(review): confirm downstream readers expect that.
                        String lemma = lemmatize(token, pos);
                        lemmas.add(lemma);

                        sentenceTokenId++;
                        printRow(tabularCoNLL08Out,
                                String.valueOf(sentenceTokenId), token, lemma, "_", pos,
                                token, lemma, pos, "0", "ROOT");

                        int id = ++tokenIdCounter;
                        printEntity(pos, id, -1, id, posOut);
                    } else {
                        // Blank line: restart the per-sentence numbering.
                        sentenceTokenId = 0;
                        tabularCoNLL08Out.println();
                    }
                    line = taggerInput.readLine();
                }

                System.out.println("End of file: " + line);
                if (line == null)
                    throw new RuntimeException("Expected end tag, but came to end");

                String expectedEnd = "___END___|" + outDir + File.separatorChar + tokenFile;
                if (!line.startsWith(expectedEnd)) {
                    System.err.println("|" + line + "|");
                    System.err.println("|" + expectedEnd);
                    throw new RuntimeException("Wrong end tag!");
                }

                line = taggerInput.readLine();
                if (line == null)
                    throw new RuntimeException("Expected blank line after end tag, but came to end");
                if (!line.equals(""))
                    throw new RuntimeException("!!!");

                int nTokens = tokenIdCounter;

                // NOTE(review): literals appear to have lost their markup; kept as found.
                posOut.println("");
                posOut.println("");

                // Lemma entities get ids following those used for the POS entities.
                for (int i = 0; i < lemmas.size(); i++) {
                    String lemma = lemmas.get(i);
                    int tid = i + 1;
                    int id = tid + nTokens;
                    if (lemma != null && !lemma.equals("_"))
                        printEntity(lemma, tid, -1, id, posOut);
                }

                // NOTE(review): literals appear to have lost their markup; kept as found.
                posOut.println("");
                posOut.println("");
            } finally {
                // Runs before the outer catch, so partial output is flushed even on failure.
                posOut.close();
            }

            printRow(tabularCoNLL08Out,
                    "0", "___END___", "_", "_", "_", "___END___", "_", "_", "0", "ROOT");
            tabularCoNLL08Out.println();
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Splits one non-empty, trimmed tagger line into {@code {token, tag}}.
     * Lines that do not consist of exactly two tab-separated fields are
     * reported on stderr and repaired where a rule exists.
     *
     * @throws RuntimeException if the line has more than two fields
     */
    private static String[] parseTaggerLine(String line) {
        String[] ss = line.split("\t");
        if (ss.length == 2)
            return ss;

        System.err.println(Arrays.toString(ss));
        if (line.equals("cannot")) {
            System.err.println("WARNING: bad tokenization: cannot");
            return new String[] { "cannot", "VBP" };
        }
        if (line.equals("rebuilt")) {
            System.err.println("WARNING: changed tagger output: rebuilt");
            return new String[] { "rebuilt", "VBN" };
        }
        if (ss.length == 1) {
            System.err.println("WARNING: changed tagger output: " + line);
            return new String[] { line, "NN" };
        }
        throw new RuntimeException("this line: |" + line + "|");
    }

    /** Writes the given columns separated by tabs, followed by a line break. */
    private static void printRow(PrintWriter out, String... columns) {
        for (int i = 0; i < columns.length; i++) {
            if (i > 0)
                out.print('\t');
            out.print(columns[i]);
        }
        out.println();
    }

    /**
     * Writes one entity line with content {@code l} to {@code out}.
     *
     * <p>NOTE(review): this method arrived damaged. The opening of the first
     * literal, the condition and the whole non-{@code else} branch were
     * missing, and the closing literal is empty. Only the surviving
     * single-position form is emitted, with its literals unchanged. The
     * {@code id} parameter was presumably written by the lost part. Restore
     * the original from version control.
     *
     * @param l     text content of the entity
     * @param start position the entity refers to
     * @param end   must be {@code -1}, as passed by every call in this file
     * @param id    entity id (not used by the surviving code)
     * @param out   destination writer
     * @throws UnsupportedOperationException if {@code end} is not {@code -1}
     */
    static void printEntity(String l, int start, int end, int id, PrintWriter out) {
        if (end != -1)
            throw new UnsupportedOperationException(
                    "entity with explicit end (" + end + ") cannot be written: that branch is missing");

        StringBuilder sb = new StringBuilder(" ");
        sb.append("\" on=\"#").append(start).append("\">");
        sb.append(l);
        sb.append("");
        out.println(sb);
    }

    /**
     * Loads the lemma dictionary. Each line must hold at least three
     * tab-separated fields; the first two form the key and the third is the
     * lemma. Any failure prints a stack trace and exits with status 1.
     *
     * @param file path of the dictionary file
     */
    private static void readDict(String file) {
        try {
            dict = new HashMap<String, String>();
            BufferedReader dictInput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), ENCODING));
            try {
                int lineNumber = 0;
                String line = dictInput.readLine();
                while (line != null) {
                    lineNumber++;
                    String[] ss = line.split("\t");
                    if (ss.length < 3)
                        throw new RuntimeException("Malformed dictionary entry at " + file + ":"
                                + lineNumber + ": |" + line + "|");
                    dict.put(ss[0] + "\t" + ss[1], ss[2]);
                    line = dictInput.readLine();
                }
            } finally {
                dictInput.close();
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Looks up the lemma of a word form.
     *
     * @param w word form; lower-cased before the lookup
     * @param t tag assigned to the word
     * @return the dictionary lemma if present; otherwise the lower-cased word,
     *         unless {@code t} is one of {@link #DICTIONARY_ONLY_TAGS}, in
     *         which case {@code null}
     */
    private static String lemmatize(String w, String t) {
        // Fixed locale so the lookup key does not depend on the JVM default
        // (e.g. "I" lower-cases to a dotless i under a Turkish locale).
        w = w.toLowerCase(Locale.ENGLISH);
        String l = dict.get(w + "\t" + t);
        if (l != null)
            return l;
        if (!DICTIONARY_ONLY_TAGS.contains(t))
            return w;
        return null;
    }
}