""" --- Corpus Clean - Stage 2 Copyright (c) 2008-2009 Gisle Ytrestol (gisley@ifi.uio.no) This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. The first stage of this script takes a folder of Wikipedia Articles in the original Wikipedia markup, and strips the article for unwanted markup. The script is intended to be used together with Tokenizer v1.0, http://www.cis.uni-muenchen.de/~wastl/misc/. The output of this script is a single file which should be used as input for Tokenizer v1.0 . The output from Tokenizer should be used by stage 2, cccp.py. This second stage inserts proper sentence boundaries and allows the user to specify whether the output should be one single file for each corresponding input file, or whether the entire corpus should be dumped into on single file/corpus. Run python cccp.py -h for help! """ import regex,ccp,os,sys,re from optparse import OptionParser class WikiProcessorAfterToken: def __init__(self): pass def removeEosInSource(self,input): while regex.regremoveeosinsource2.search(input): hit = regex.regremoveeosinsource2.search(input) span = hit.span() removedeos = regex.regeos.sub(r'',input[span[0]:span[1]]) removedeos = removedeos.replace("" in line: return articleNumLine elif "" in line: return 1 def saveCorpus(self,input): maxLine = self.maxLine articleLineNum = 1 articleNum = 1 totalLineNum = 0 fileLineNum = 0 fileNum = 1 inputIndex = 0 if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: newFileName, fileNum = self.makeNewArticleFileName(input,fileNum) newFileNameObject = open(newFileName,'w') lineSplit = input.split('\n') for totalLineNum in range(len(lineSplit)): line = lineSplit[totalLineNum] inputIndex = inputIndex + len(line)+1 keepWriting = self.continueArticle(line,fileLineNum,totalLineNum,lineSplit,maxLine) if keepWriting: articleNum,articleLineNum = self.adjustArticleNum(keepWriting,articleNum,articleLineNum) if not "ARTICLE>" in line: readyLine = self.formatLine(line,articleLineNum,articleNum) newFileNameObject.write(readyLine) fileLineNum = fileLineNum + 1 articleLineNum = articleLineNum + 1 if "ARTICLE>" in line: articleNumLine = self.adjustArticleNumLine(line,articleLineNum) if not keepWriting: newFileNameObject.close() if totalLineNum + 3 > len(lineSplit): # Terminates after last article break if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: #newFileName,fileNum = self.makeNewArticleFileName(" ".join(lineSplit[totalLineNum:]),fileNum) newFileName,fileNum = self.makeNewArticleFileName(input[inputIndex:],fileNum) newFileNameObject = open(newFileName,'w') fileLineNum = 0 articleLineNum = 1 articleNum = articleNum + 1 def continueArticle(self,line,articleLineNum,totalLineNum,lineSplit,maxLine): if not maxLine: if line.startswith(''): return False else: return True if maxLine: if not line.startswith(''): return True if line.startswith(''): try: if int(articleLineNum) + lineSplit[totalLineNum+1:].index('') < int(maxLine): return 'new' else: return False except: return False def formatLine(self, line, lineNum, fileNum): fileNum = self.makeFiller(fileNum) lineNum = self.makeFiller(lineNum) #if 'blockquote>' in line: # readyLine = '[1'+str(fileNum)+str(lineNum)+'0]*|'+line+'\n' #else: readyLine = '[1'+str(fileNum)+str(lineNum)+'0] |'+line+'\n' return readyLine def makeFiller(self,num): if num < 10: filler = '00' elif num > 9 and num < 100: filler = '0' elif num >= 100: filler = '' else: print num print "ERROR" return str(filler+str(num)) """ def saveCorpusDump(self,input): if self.dump == 1: newFileNameObject = open(self.output,'w') lineSplit = input.split('\n') lineNum = 0 for line in lineSplit: if not line.startswith(''): readyLine = self.formatLine(line,lineNum,'9999') newFileNameObject.write(readyLine) lineNum = lineNum + 1 newFileNameObject.close() """ def checkSyntax(output,input): if output == None or input == None: return False if os.path.isdir(output) and os.path.isfile(input): return True else: return False #if not maxLine == None or not isinstance(maxLine,int): # return False def main(): parser = OptionParser() parser.add_option("-i", "--input", dest="input", help="Input file for cleansed Wikipedia Source files", metavar="Input File") parser.add_option("-o", "--output", dest="output", help="Output folder/file where cleaned up versions of Wikipedia Source files will be stored. If file name must be provided if the -dump option is used, or a folder name if -dump is not enabled", metavar="Output File") parser.add_option("-l", "--maxline", dest="line", help="Maximum number of lines per file", metavar="Maximum number of lines") options, args = parser.parse_args() input = options.input output = options.output maxLine = options.line if checkSyntax(output,input): pass else: sys.stdout.write("\nERROR WITH THE INPUT/OUTPUT FILES\n") sys.exit(0) wikiStore = WikiStore(output,maxLine) wikiReader = ccp.WikiReader() wikiProcess = WikiProcessorAfterToken() fileObject = open(input,'r') input = wikiReader.readFile(fileObject) input = wikiProcess.processFileAfterToken(input) #wikiStore.saveCorpus(input) wikiStore.saveCorpus(input) #wikiStore.saveCorpus(input) if __name__ == '__main__': main()