package lkformat2;

import java.io.*;
import java.util.*;
//import java.util.regex.*;

//import mpqareader.SubjectivityLexicon;

/**
 * Converts POS-tagger output into two kinds of files:
 * <ul>
 *   <li>one tab-separated file with ten columns per token row, shared by all
 *       processed base texts, and</li>
 *   <li>one <code>.pos.xml</code> annotation file per base text, containing a
 *       POS annotation layer and a LEMMA annotation layer.</li>
 * </ul>
 *
 * The tagger output is read as one continuous stream in which every base text
 * is framed by a <code>___BEGIN___|&lt;token file&gt;</code> line and a line
 * containing <code>___END___</code>, followed by one blank line. Between the
 * markers each non-blank line is <code>token&lt;TAB&gt;tag</code>, and a blank
 * line ends a sentence.
 *
 * Lemmas come from a tab-separated dictionary (<code>word, tag, lemma</code>)
 * loaded once by {@link #main(String[])}.
 */
public class POSTagsToLK {

    private static final String ENCODING = "UTF-8";

    /** Marker that opens the tagger output belonging to one base text. */
    private static final String BEGIN_MARKER = "___BEGIN___";

    /** Marker that closes the tagger output belonging to one base text. */
    private static final String END_MARKER = "___END___";

    /**
     * Tags for which the lower-cased word form is NOT used as a fallback lemma
     * when the dictionary has no entry.
     */
    private static final Set<String> INFLECTED_TAGS = new HashSet<String>(
            Arrays.asList("JJR", "JJS", "NNS", "VBD", "VBG", "VBN", "VBP", "VBZ"));

    /**
     * Lemma dictionary keyed by <code>word + "\t" + tag</code>. Starts out empty
     * so that {@link #processFile} can be called without a prior
     * {@link #readDict(String)}.
     */
    private static HashMap<String, String> dict = new HashMap<String, String>();

    /**
     * Command-line entry point.
     *
     * @param argv <code>lkDir taggerOutputFile conll2008OutFile outDir dictFile</code>:
     *             the directory scanned for <code>*lktext.xml</code> files, the
     *             tagger output to read, the tabular file to write, the
     *             directory receiving the <code>.pos.xml</code> files, and the
     *             lemma dictionary. Exits with status 1 on any error.
     */
    public static void main(String[] argv) {
        if (argv.length < 5) {
            System.err.println("Usage: POSTagsToLK <lkDir> <taggerOutputFile> "
                    + "<conll2008OutFile> <outDir> <dictFile>");
            System.exit(1);
            return;
        }
        String lkDir = argv[0];
        String sstOutputFile = argv[1];
        String conll2008OutFile = argv[2];
        String outDir = argv[3];
        String dictFile = argv[4];

        try {
            readDict(dictFile);

            BufferedReader taggerInput = null;
            PrintWriter conll2008Out = null;
            // The inner try/finally closes (and flushes) the streams before the
            // outer catch calls System.exit, which would otherwise skip it.
            try {
                taggerInput = new BufferedReader(
                        new InputStreamReader(new FileInputStream(sstOutputFile), ENCODING));
                conll2008Out = new PrintWriter(
                        new OutputStreamWriter(new FileOutputStream(conll2008OutFile), ENCODING));

                String[] files = new File(lkDir).list();
                if (files == null) {
                    // File.list() returns null when the path is not a readable directory.
                    throw new IOException("Cannot list directory |" + lkDir + "|");
                }
                // Sorted so that the files are visited in a deterministic order;
                // the tagger output is consumed sequentially in that same order.
                Arrays.sort(files);
                for (String file : files) {
                    if (file.endsWith("lktext.xml")) {
                        convertFile(lkDir + File.separatorChar + file, taggerInput,
                                conll2008Out, outDir);
                    }
                }

                // PrintWriter swallows I/O errors; surface them explicitly.
                if (conll2008Out.checkError()) {
                    throw new IOException("Error writing |" + conll2008OutFile + "|");
                }
            } finally {
                closeQuietly(conll2008Out);
                closeQuietly(taggerInput);
            }
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Consumes the tagger output of one base text from <code>taggerInput</code>,
     * appends the corresponding rows to <code>tabularCoNLL08Out</code> and writes
     * <code>&lt;outDir&gt;/&lt;basename&gt;.pos.xml</code>.
     *
     * On any failure the stack trace is printed and the JVM exits with status 1.
     *
     * @param textFile          path of the base text; only its file name is used
     * @param taggerInput       tagger output, positioned at a begin-marker line
     * @param tabularCoNLL08Out destination of the tab-separated rows
     * @param outDir            directory receiving the <code>.pos.xml</code> file
     */
    public static void processFile(String textFile, BufferedReader taggerInput,
            PrintWriter tabularCoNLL08Out, String outDir) {
        try {
            convertFile(textFile, taggerInput, tabularCoNLL08Out, outDir);
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Does the work of {@link #processFile} but reports failures by throwing,
     * so that callers can release their resources before terminating.
     */
    private static void convertFile(String textFile, BufferedReader taggerInput,
            PrintWriter tabularCoNLL08Out, String outDir) throws IOException {
        // Strip everything up to and including the last '/'.
        textFile = textFile.replaceAll("[^/]*/", "");

        System.out.println("Reading tags for base text |" + textFile + "|");

        String line = taggerInput.readLine();
        if (line == null) {
            throw new RuntimeException("Expected beginning of file, but came to end");
        }
        if (!line.startsWith(BEGIN_MARKER)) {
            throw new RuntimeException("Expected beginning of file: line = |" + line + "|");
        }
        int beginPrefixLength = (BEGIN_MARKER + "|").length();
        if (line.length() < beginPrefixLength) {
            throw new RuntimeException("Begin tag names no token file: line = |" + line + "|");
        }
        // The token file name follows the marker and ends at the first tab.
        String tokenFile = line.substring(beginPrefixLength);
        tokenFile = tokenFile.replaceAll("\t.*", "");

        System.out.println("Token file is |" + tokenFile + "|");

        // Begin row; it is not followed by a blank line, so it belongs to the
        // same sentence block as the first real tokens.
        String beginLabel = BEGIN_MARKER + "|" + tokenFile;
        printRow(tabularCoNLL08Out,
                "1", beginLabel, "_", "_", "_", "_", beginLabel, "_", "0", "ROOT");

        // From here on only the file name (no directories, either separator) is used.
        tokenFile = tokenFile.replaceAll("[^/\\\\]*[/\\\\]", "");

        String basename = textFile.replaceFirst("\\.lktext\\.xml", "");
        basename = basename.replaceAll("[^/\\\\]*[/\\\\]", "");
        basename = outDir + File.separatorChar + basename;

        String outPOSFile = basename + ".pos.xml";

        List<String> lemmas = new ArrayList<String>();
        int tokenIdCounter = 0;   // running token number within this base text
        int sentenceTokenId = 0;  // token number within the current sentence

        PrintWriter posOut = new PrintWriter(
                new OutputStreamWriter(new FileOutputStream(outPOSFile), ENCODING));
        try {
            posOut.println("<?xml version=\"1.0\" encoding=\"" + ENCODING + "\"?>");
            posOut.println("<lk-annotation>");
            posOut.println("<meta-info>");
            posOut.println("  <tag name=\"base\">" + escapeXml(textFile) + "</tag>");
            posOut.println("  <tag name=\"annotator\">LTHPOSTagger</tag>");
            posOut.println("</meta-info>");
            posOut.println("<annotation scope=\"" + escapeXml(tokenFile) + "\" provides=\"POS\">");

            line = taggerInput.readLine();
            while (line != null && !line.contains(END_MARKER)) {
                line = line.trim();
                if (!line.equals("")) {
                    String[] tagged = parseTaggedLine(line);
                    String token = tagged[0];
                    String pos = tagged[1];
                    String lemma = lemmatize(token, pos);
                    lemmas.add(lemma);

                    sentenceTokenId++;

                    // NOTE(review): when no lemma is known, lemma is null and the
                    // literal text "null" ends up in both lemma columns. Kept as is
                    // because consumers of this file may depend on it - confirm
                    // whether "_" was intended.
                    printRow(tabularCoNLL08Out,
                            String.valueOf(sentenceTokenId), token, lemma, "_", pos,
                            token, lemma, pos, "0", "ROOT");

                    int id = ++tokenIdCounter;
                    printEntity(pos, id, -1, id, posOut);
                } else {
                    // A blank line closes the current sentence.
                    sentenceTokenId = 0;
                    tabularCoNLL08Out.println();
                }

                line = taggerInput.readLine();
            }

            System.out.println("End of file: " + line);
            if (line == null) {
                throw new RuntimeException(
                        "Expected end tag for |" + tokenFile + "|, but came to end");
            }
            String expectedEnd = END_MARKER + "|" + outDir + File.separatorChar + tokenFile;
            if (!line.startsWith(expectedEnd)) {
                System.err.println("|" + line + "|");
                System.err.println("|" + expectedEnd);
                throw new RuntimeException("Wrong end tag!");
            }

            // The end tag must be followed by exactly one blank line.
            line = taggerInput.readLine();
            if (line == null) {
                throw new RuntimeException("Expected blank line after end tag, but came to end");
            }
            if (!line.equals("")) {
                throw new RuntimeException(
                        "Expected blank line after end tag: line = |" + line + "|");
            }

            int nTokens = tokenIdCounter;

            posOut.println("</annotation>");
            posOut.println("<annotation scope=\"" + escapeXml(tokenFile)
                    + "\" provides=\"LEMMA\">");

            // Lemma entities get the ids following the POS entities
            // (nTokens + 1 ...) and point back at the token with the same index.
            for (int i = 0; i < lemmas.size(); i++) {
                String lemma = lemmas.get(i);
                int tid = i + 1;
                int id = tid + nTokens;
                if (lemma != null && !lemma.equals("_")) {
                    printEntity(lemma, tid, -1, id, posOut);
                }
            }

            posOut.println("</annotation>");
            posOut.println("</lk-annotation>");

            // PrintWriter swallows I/O errors; surface them explicitly.
            if (posOut.checkError()) {
                throw new IOException("Error writing |" + outPOSFile + "|");
            }
        } finally {
            posOut.close();
        }

        printRow(tabularCoNLL08Out,
                "0", END_MARKER, "_", "_", "_", END_MARKER, "_", "_", "0", "ROOT");
        tabularCoNLL08Out.println();
    }

    /**
     * Splits one trimmed, non-empty tagger line into <code>{token, tag}</code>.
     * Lines without exactly two tab-separated fields are repaired where a rule
     * exists (with a warning on stderr) and rejected otherwise.
     *
     * @throws RuntimeException if the line has more than two fields
     */
    private static String[] parseTaggedLine(String line) {
        String[] ss = line.split("\t");
        if (ss.length == 2) {
            return ss;
        }
        System.err.println(Arrays.toString(ss));
        if (line.equals("cannot")) {
            System.err.println("WARNING: bad tokenization: cannot");
            return new String[] { "cannot", "VBP" };
        }
        if (line.equals("rebuilt")) {
            System.err.println("WARNING: changed tagger output: rebuilt");
            return new String[] { "rebuilt", "VBN" };
        }
        if (ss.length == 1) {
            // A token without a tag is given the tag NN.
            System.err.println("WARNING: changed tagger output: " + line);
            return new String[] { line, "NN" };
        }
        throw new RuntimeException("this line: |" + line + "|");
    }

    /** Writes the given columns as one tab-separated line. */
    private static void printRow(PrintWriter out, String... columns) {
        StringBuilder row = new StringBuilder();
        for (int i = 0; i < columns.length; i++) {
            if (i > 0) {
                row.append('\t');
            }
            row.append(columns[i]);
        }
        out.println(row);
    }

    /**
     * Writes one <code>&lt;e&gt;</code> element whose text content is the label
     * <code>l</code>, XML-escaped.
     *
     * @param l     label (a POS tag or a lemma)
     * @param start referenced position; written as <code>on="#start"</code> when
     *              <code>end</code> is -1, otherwise as <code>start="#start"</code>
     * @param end   end position, or -1 for a single-position reference
     * @param id    value of the element's <code>id</code> attribute
     * @param out   destination writer
     */
    static void printEntity(String l, int start, int end,
            int id, PrintWriter out) {
        StringBuilder sb = new StringBuilder("  <e id=\"");
        sb.append(id);
        if (end != -1) {
            sb.append("\" start=\"#").append(start)
              .append("\" end=\"#").append(end).append("\">");
        } else {
            sb.append("\" on=\"#").append(start).append("\">");
        }
        // Tokens such as "&" or "<" would otherwise make the file malformed XML.
        sb.append(escapeXml(l));
        sb.append("</e>");
        out.println(sb);
    }

    /**
     * Replaces the characters <code>&amp; &lt; &gt; "</code> by their predefined
     * XML entities. Returns null for null input.
     */
    private static String escapeXml(String s) {
        if (s == null) {
            return null;
        }
        StringBuilder escaped = new StringBuilder(s.length() + 8);
        for (int i = 0; i < s.length(); i++) {
            char c = s.charAt(i);
            switch (c) {
                case '&':
                    escaped.append("&amp;");
                    break;
                case '<':
                    escaped.append("&lt;");
                    break;
                case '>':
                    escaped.append("&gt;");
                    break;
                case '"':
                    escaped.append("&quot;");
                    break;
                default:
                    escaped.append(c);
            }
        }
        return escaped.toString();
    }

    /**
     * Loads the lemma dictionary, replacing any previously loaded one. Each
     * non-blank line must hold at least three tab-separated fields:
     * word, tag, lemma. On any failure the stack trace is printed and the JVM
     * exits with status 1.
     *
     * @param file path of the dictionary file, read as UTF-8
     */
    private static void readDict(String file) {
        try {
            HashMap<String, String> entries = new HashMap<String, String>();
            BufferedReader dictInput = new BufferedReader(
                    new InputStreamReader(new FileInputStream(file), ENCODING));
            try {
                int lineNo = 0;
                String line = dictInput.readLine();
                while (line != null) {
                    lineNo++;
                    // Blank lines (e.g. a trailing one) carry no entry.
                    if (line.trim().length() > 0) {
                        String[] ss = line.split("\t");
                        if (ss.length < 3) {
                            throw new IllegalArgumentException("Malformed dictionary entry at "
                                    + file + ":" + lineNo + ": |" + line + "|");
                        }
                        entries.put(ss[0] + "\t" + ss[1], ss[2]);
                    }
                    line = dictInput.readLine();
                }
            } finally {
                closeQuietly(dictInput);
            }
            dict = entries;
        } catch (Exception e) {
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Looks up the lemma of a word form.
     *
     * @param w word form; lower-cased before the lookup
     * @param t POS tag
     * @return the dictionary lemma if there is one; otherwise the lower-cased
     *         word, unless the tag is one of the inflected-form tags, in which
     *         case null is returned
     */
    private static String lemmatize(String w, String t) {
        // Fixed locale: the default-locale variant maps "I" differently under
        // e.g. a Turkish locale, which would break dictionary lookups.
        w = w.toLowerCase(Locale.ENGLISH);
        String l = dict.get(w + "\t" + t);
        if (l != null) {
            return l;
        }
        if (!INFLECTED_TAGS.contains(t)) {
            return w;
        }
        return null;
    }

    /** Closes the resource if it is non-null, ignoring a failure to close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done about a failed close at this point.
        }
    }

}