"""  --- Corpus Clean - Stage 2
Copyright (c) 2008-2009 Gisle Ytrestol (gisley@ifi.uio.no)
 
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at
your option) any later version.
 
This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
License for more details.
 

The first stage of this script takes a folder of Wikipedia Articles in the
original Wikipedia markup, and strips the articles of unwanted markup.
The script is intended to be used together with Tokenizer v1.0, 
http://www.cis.uni-muenchen.de/~wastl/misc/.

The output of this script is a single file which should be used as input
for Tokenizer v1.0 . The output from Tokenizer should be used by stage 2,
cccp.py. This second stage inserts proper sentence boundaries and allows
the user to specify whether the output should be one single file for each
corresponding input file, or whether the entire corpus should be dumped
into one single file/corpus.

Run python cccp.py -h for help!

"""


import regex,ccp,os,sys,re
from optparse import OptionParser


class WikiProcessorAfterToken:
    """Post-process the tokenizer output of the cleaned Wikipedia corpus.

    Every pattern used here lives in the project-local ``regex`` module and
    is not visible from this file; judging by its use in ``addNewline``,
    ``regex.regeos`` presumably matches the sentence-boundary marker emitted
    by the tokenizer (TODO confirm against regex.py).
    """

    def __init__(self):
        pass

    def _removeEosInTag(self, input, pattern, tag):
        """Strip ``regex.regeos`` matches from every region found by *pattern*.

        After a region has been cleaned, its ``<tag`` / ``</tag`` openers are
        rewritten to ``<___tag`` / ``</___tag``.  The rewrite is what lets
        the loop terminate -- presumably the pattern no longer matches a
        region once it is masked (verify against regex.py).  The caller
        undoes the masking once all regions have been processed.

        input   -- text to process
        pattern -- compiled pattern locating one region per search
        tag     -- tag name to mask, e.g. "source"
        Returns the processed text.
        """
        hit = pattern.search(input)
        while hit:
            start, end = hit.span()
            cleaned = regex.regeos.sub(r'', input[start:end])
            cleaned = cleaned.replace("<" + tag, "<___" + tag)
            cleaned = cleaned.replace("</" + tag, "</___" + tag)
            input = input[:start] + cleaned + input[end:]
            # Search again from the start: the text before the next region
            # has changed length, so the previous offsets are stale.
            hit = pattern.search(input)
        return input

    def removeEosInSource(self,input):
        """Remove regeos matches inside regions found by
        ``regex.regremoveeosinsource2`` and mask their ``source`` tags."""
        return self._removeEosInTag(input, regex.regremoveeosinsource2, "source")

    def removeEosInCode(self,input):
        """Remove regeos matches inside regions found by
        ``regex.regremoveeosincode2`` and mask their ``code`` tags."""
        return self._removeEosInTag(input, regex.regremoveeosincode2, "code")

    def processFileAfterToken(self,input):
        """Run the complete post-tokenizer clean-up on *input*.

        Steps, in order: collapse ``regremoveeosinmath`` matches to
        "group 1, space, group 3"; clean the source and code regions;
        collapse ``regremoveeosinnowiki`` matches the same way as math;
        undo the ``___`` tag masking; convert the remaining boundary markers
        to newlines; drop blank lines.  Returns the resulting text.
        """
        input = regex.regremoveeosinmath.sub(r'\1 \3',input)
        input = self.removeEosInSource(input)
        input = self.removeEosInCode(input)
        input = regex.regremoveeosinnowiki.sub(r'\1 \3',input)
        # Undo the temporary masking applied while cleaning source/code.
        input = input.replace("<___","<")
        input = input.replace("</___","</")
        input = self.addNewline(input)
        input = self.removeEmptyLines(input)
        return input

    def addNewline(self,input):
        """Delete everything matched by ``regneosfirst``, ``regneosfinal``
        and ``regneosremove`` (in that order), then replace every
        ``regeos`` match with a newline."""
        input = regex.regneosfirst.sub('',input)
        input = regex.regneosfinal.sub('',input)
        input = regex.regneosremove.sub('',input)
        input = regex.regeos.sub('\n',input)
        return input

    def removeEmptyLines(self,input):
        """Return *input* without empty or whitespace-only lines.

        Every line that is kept is terminated by a single newline, so
        non-empty output always ends in a newline character.
        """
        # join() instead of repeated concatenation keeps this linear in the
        # size of the corpus.
        return ''.join(line + '\n' for line in input.split('\n') if line.strip())


class WikiStore:
    """Write the processed corpus to files below an output folder.

    Two modes, selected by the truthiness of ``maxLine``:

    * truthy -- numbered ``NN.txt`` files; consecutive articles share a file
      for as long as ``continueArticle`` estimates that the next article
      still fits under ``maxLine`` lines;
    * falsy  -- one file per article, named ``<number>.<title>`` where the
      title is extracted with ``regex.regtitlepreprocess``.
    """

    def __init__(self,output,maxLine):
        """output  -- output folder; passed through ``ccp.AdjustName.addSlash``
        maxLine -- maximum number of lines per file, or a falsy value for
                   one file per article (an int or a numeric string)
        """
        self.maxLine = maxLine
        self.adjust = ccp.AdjustName()
        self.output = self.adjust.addSlash(output)

    def makeNewArticleFileName(self,input,fileNum):
        """Return ``(path, nextFileNum)`` for the first article title in *input*.

        The path is ``<output><zero-padded number>.<title>`` with every "/"
        removed from the title.  Numbers whose path already exists are
        skipped.  Returns ``(False, False)`` when no title is found.
        """
        # The title does not depend on fileNum, so look it up only once.
        titleGroup = regex.regtitlepreprocess.search(input)
        if not titleGroup:
            return False, False
        title = titleGroup.group(1).replace('/', '')
        while True:
            newFileName = self.output + self.makeFiller(fileNum) + "." + title
            fileNum = fileNum + 1
            if not os.path.isfile(newFileName):
                return newFileName, fileNum

    def makeNewFileName(self,fileNum):
        """Return ``(path, nextFileNum)`` for the next unused ``NN.txt`` file.

        The number is zero-padded to two digits.  Numbers of 100 and above
        are written in full instead of losing their leading digit, which
        previously produced names such as "00.txt" for file 100.
        """
        while True:
            newFileName = self.output + ('%02d' % fileNum) + '.txt'
            fileNum = fileNum + 1
            if not os.path.isfile(newFileName):
                return newFileName, fileNum

    def adjustArticleNum(self,keepWriting,articleNum,articleLineNum):
        """Advance to the next article when *keepWriting* is ``'new'``.

        Returns ``(articleNum + 1, 1)`` in that case, otherwise the two
        counters unchanged.
        """
        if keepWriting == 'new':
            return articleNum + 1, 1
        return articleNum, articleLineNum

    def adjustArticleNumLine(self,line,articleNumLine):
        """Return *articleNumLine* for a line containing ``<ARTICLE>``, 1 for
        a line containing ``</ARTICLE>`` and ``None`` for any other line.

        Kept for callers outside this class; ``saveCorpus`` does not use it.
        """
        if "<ARTICLE>" in line:
            return articleNumLine
        elif "</ARTICLE>" in line:
            return 1
        return None

    def saveCorpus(self,input):
        """Write *input* to one or more output files.

        Lines containing ``ARTICLE>`` are treated as markup and never
        written; every other line is written through ``formatLine`` with the
        running article number and line-in-article number.  A closing
        ``</ARTICLE>`` line for which ``continueArticle`` returns ``False``
        ends the current file.

        Raises ValueError when a file name must be derived from an article
        title but no title can be found in the remaining text.
        """
        maxLine = self.maxLine
        articleLineNum = 1
        articleNum = 1
        fileLineNum = 0
        inputIndex = 0      # offset in input just past the current line
        fileNum = 1
        if maxLine:
            newFileName, fileNum = self.makeNewFileName(fileNum)
        else:
            newFileName, fileNum = self._articleFileNameOrFail(input, fileNum)
        outFile = open(newFileName, 'w')
        try:
            lineSplit = input.split('\n')
            for totalLineNum, line in enumerate(lineSplit):
                inputIndex = inputIndex + len(line) + 1
                keepWriting = self.continueArticle(line, fileLineNum, totalLineNum,
                                                   lineSplit, maxLine)
                if keepWriting:
                    articleNum, articleLineNum = self.adjustArticleNum(
                        keepWriting, articleNum, articleLineNum)
                    if "ARTICLE>" not in line:
                        outFile.write(self.formatLine(line, articleLineNum, articleNum))
                        fileLineNum = fileLineNum + 1
                        articleLineNum = articleLineNum + 1
                    continue

                # The current file is complete.
                outFile.close()
                if totalLineNum + 3 > len(lineSplit):  # Terminates after last article
                    break
                if maxLine:
                    newFileName, fileNum = self.makeNewFileName(fileNum)
                else:
                    # Only the text after this line can hold the next title.
                    newFileName, fileNum = self._articleFileNameOrFail(
                        input[inputIndex:], fileNum)
                outFile = open(newFileName, 'w')
                fileLineNum = 0
                articleLineNum = 1
                articleNum = articleNum + 1
        finally:
            # Also covers input that does not end on </ARTICLE> and any
            # error raised above; closing an already closed file is a no-op.
            outFile.close()

    def _articleFileNameOrFail(self, input, fileNum):
        """Like ``makeNewArticleFileName`` but raise ValueError instead of
        returning ``(False, False)``, so ``False`` is never handed to open()."""
        newFileName, nextFileNum = self.makeNewArticleFileName(input, fileNum)
        if not newFileName:
            raise ValueError("no article title found in the remaining input")
        return newFileName, nextFileNum

    def continueArticle(self,line,articleLineNum,totalLineNum,lineSplit,maxLine):
        """Decide whether writing continues in the current file after *line*.

        Returns ``True`` for any line not starting with ``</ARTICLE>``.  For
        a closing line it returns ``'new'`` when *maxLine* is truthy and
        *articleLineNum* plus the number of lines up to the next line equal
        to ``</ARTICLE>`` is below *maxLine*, and ``False`` otherwise
        (including when there is no further closing line or the numbers
        cannot be converted to int).

        Note: ``saveCorpus`` passes the number of lines written to the
        current file as *articleLineNum*.
        """
        if not line.startswith('</ARTICLE>'):
            return True
        if not maxLine:
            return False
        try:
            nextStart = totalLineNum + 1
            # index() with a start offset avoids copying the list tail.
            nextArticleLen = lineSplit.index('</ARTICLE>', nextStart) - nextStart
            if int(articleLineNum) + nextArticleLen < int(maxLine):
                return 'new'
            return False
        except (ValueError, TypeError):
            return False

    def formatLine(self, line, lineNum, fileNum):
        """Return *line* prefixed with its identifier and newline-terminated.

        The result is ``[1<fileNum><lineNum>0] |<line>`` plus a newline,
        with both numbers padded by ``makeFiller``.
        """
        fileNum = self.makeFiller(fileNum)
        lineNum = self.makeFiller(lineNum)
        return '[1' + str(fileNum) + str(lineNum) + '0] |' + line + '\n'

    def makeFiller(self,num):
        """Return *num* as a string, left-padded with zeros to three digits
        (7 -> '007', 42 -> '042', 123 -> '123')."""
        if num < 10:
            filler = '00'
        elif num < 100:
            filler = '0'
        else:
            filler = ''
        return str(filler + str(num))
             
         
def checkSyntax(output,input):
    """Return True when *output* is an existing directory and *input* is an
    existing regular file; False otherwise, including when either is None.
    """
    # None means the option was not given; os.path would reject it.
    if output is None or input is None:
        return False
    return os.path.isdir(output) and os.path.isfile(input)
    
    
    #if not maxLine == None or not isinstance(maxLine,int):
    #    return False


def main():
    """Command-line entry point.

    Parses -i/--input, -o/--output and -l/--maxline, validates them, reads
    the input file through ``ccp.WikiReader``, post-processes the text with
    ``WikiProcessorAfterToken`` and writes the result with ``WikiStore``.

    Exits with status 1 (message on stderr) when the input is not an
    existing file, the output is not an existing folder, or --maxline is
    not an integer.
    """
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                  help="Input file for cleansed Wikipedia Source files", metavar="Input File")
    # The previous help text described a -dump option that this parser does
    # not define; checkSyntax only accepts an existing folder here.
    parser.add_option("-o", "--output", dest="output",
                  help="Existing output folder where the cleaned up versions of the Wikipedia Source files will be stored", metavar="Output File")
    parser.add_option("-l", "--maxline", dest="line",
                  help="Maximum number of lines per file", metavar="Maximum number of lines")

    options, args = parser.parse_args()
    input = options.input
    output = options.output
    maxLine = options.line

    if not checkSyntax(output,input):
        sys.stderr.write("\nERROR WITH THE INPUT/OUTPUT FILES\n")
        sys.exit(1)

    if maxLine is not None:
        # WikiStore converts the value with int(); reject anything that
        # cannot be converted instead of silently ignoring the limit.
        try:
            int(maxLine)
        except ValueError:
            sys.stderr.write("\nERROR: --maxline MUST BE AN INTEGER\n")
            sys.exit(1)

    wikiStore = WikiStore(output,maxLine)
    wikiReader = ccp.WikiReader()
    wikiProcess = WikiProcessorAfterToken()

    fileObject = open(input,'r')
    try:
        input = wikiReader.readFile(fileObject)
    finally:
        # Harmless if readFile already closed the handle itself.
        fileObject.close()

    input = wikiProcess.processFileAfterToken(input)
    wikiStore.saveCorpus(input)
    
        #wikiStore.saveCorpus(input)


if __name__ == '__main__':
    main()