""" --- Corpus Clean - Stage 2 Copyright (c) 2008-2010 Gisle Ytrestol (gisley@ifi.uio.no) This program is free software; you can redistribute it and/or modify it under the terms of the GNU Lesser General Public License as published by the Free Software Foundation; either version 2.1 of the License, or (at your option) any later version. This program is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details. The first stage of this script takes a folder of Wikipedia Articles in the original Wikipedia markup, and strips the article for unwanted markup. The script is intended to be used together with Tokenizer v1.0, http://www.cis.uni-muenchen.de/~wastl/misc/. The output of this script is a single file which should be used as input for Tokenizer v1.0 . The output from Tokenizer should be used by stage 2, cccp.py. This second stage inserts proper sentence boundaries and allows the user to specify whether the output should be one single file for each corresponding input file, or whether the entire corpus should be dumped into on single file/corpus. Example of use: whole wiki: python cccp.py -i t2 -l 10000000 -o testOutput/ -r /100 -t wholeWiki wescience: python cccp.py -i t2 -l 1000 -o testOutput/ Run python cccp.py -h for help! """ import regex,ccp,os,sys,re from optparse import OptionParser class WikiProcessorAfterToken: def __init__(self): pass def removeEosInSource(self,input): while regex.regremoveeosinsource2.search(input): hit = regex.regremoveeosinsource2.search(input) span = hit.span() removedeos = regex.regeos.sub(r'',input[span[0]:span[1]]) removedeos = removedeos.replace("" in line: if regex.regonlyXML.match(line): continue if len(line) > 1000 and regex.regmultiplePipes.search(line): continue if len(line) > 1000 and "<" in line and ">" in line: continue if regex.regletternumber.search(line): keepLine.append(line) return "\n".join(keepLine) def addNewline(self,input): input = regex.regneosfirst.sub('',input) input = regex.regneosfinal.sub('',input) input = regex.regneosremove.sub('',input) input = regex.regeos.sub('\n',input) return input def removeEmptyLines(self,input): splitInput = input.split('\n') newArticle = '' for line in splitInput: if not line.strip(): continue else: newArticle = newArticle+str(line)+'\n' return newArticle # def removeTableLeftover(self,input): # splitInput = input.split('\n') # newArticle = '' # for line in splitInput: # if not regex.regletternumber.search(line): # continue # if not "ARTICLE>" in line: # if regex.regonlyXML.match(line): # continue # if line.startswith("|") or line.startswith("!") or line.startswith("{|") or line.startswith("|}"): # continue # else: # newArticle = newArticle+str(line)+'\n' # return newArticle.rstrip() # class WikiStoreWeScience: def __init__(self,output,maxLine): self.maxLine = maxLine self.adjust = ccp.AdjustName() self.output = self.adjust.addSlash(output) def makeNewArticleFileName(self,input,fileNum): while True: titleGroup = regex.regtitlepreprocess.search(input) if titleGroup: #retrive the correct article title = titleGroup.group(1) title = re.sub(r'/','', title) #print input[:100] #print fileNum #print title else: return False,False filler = self.makeFiller(fileNum) newFileName = self.output+filler+"."+title if not os.path.isfile(newFileName): fileNum = fileNum + 1 return newFileName,fileNum else: fileNum = fileNum + 1 def makeNewFileName(self,fileNum): while True: filler = self.makeFiller(fileNum) newFileName = self.output+filler[1:]+'.txt' # WeScience.'+filler if not os.path.isfile(newFileName): fileNum = fileNum + 1 return newFileName,fileNum else: fileNum = fileNum + 1 def adjustArticleNum(self,keepWriting,articleNum,articleLineNum): if keepWriting == 'new': return articleNum + 1,1 else: return articleNum,articleLineNum def adjustArticleNumLine(self,line,articleNumLine): if "
" in line: return articleNumLine elif "
" in line: return 1 def saveCorpus(self,input): maxLine = self.maxLine articleLineNum = 1 articleNum = 1 totalLineNum = 0 fileLineNum = 0 fileNum = 1 inputIndex = 0 if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: newFileName, fileNum = self.makeNewArticleFileName(input,fileNum) newFileNameObject = open(newFileName,'w') lineSplit = input.split('\n') for totalLineNum in range(len(lineSplit)): line = lineSplit[totalLineNum] inputIndex = inputIndex + len(line)+1 keepWriting = self.continueArticle(line,fileLineNum,totalLineNum,lineSplit,maxLine) if keepWriting: articleNum,articleLineNum = self.adjustArticleNum(keepWriting,articleNum,articleLineNum) if not "ARTICLE>" in line: readyLine = self.formatLine(line,articleLineNum,articleNum) newFileNameObject.write(readyLine) fileLineNum = fileLineNum + 1 articleLineNum = articleLineNum + 1 if "ARTICLE>" in line: articleNumLine = self.adjustArticleNumLine(line,articleLineNum) if not keepWriting: newFileNameObject.close() if totalLineNum + 3 > len(lineSplit): # Terminates after last article break if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: #newFileName,fileNum = self.makeNewArticleFileName(" ".join(lineSplit[totalLineNum:]),fileNum) newFileName,fileNum = self.makeNewArticleFileName(input[inputIndex:],fileNum) newFileNameObject = open(newFileName,'w') fileLineNum = 0 articleLineNum = 1 articleNum = articleNum + 1 def continueArticle(self,line,articleLineNum,totalLineNum,lineSplit,maxLine): if not maxLine: if line.startswith(''): return False else: return True if maxLine: if not line.startswith(''): return True if line.startswith(''): try: if int(articleLineNum) + lineSplit[totalLineNum+1:].index('') < int(maxLine): return 'new' else: return False except: return False def formatLine(self, line, lineNum, fileNum): fileNum = self.makeFiller(fileNum) lineNum = self.makeFiller(lineNum) #if 'blockquote>' in line: # readyLine = '[1'+str(fileNum)+str(lineNum)+'0]*|'+line+'\n' #else: readyLine = '[1'+str(fileNum)+str(lineNum)+'0] |'+line+'\n' return readyLine def makeFiller(self,num): if num < 10: filler = '00' elif num > 9 and num < 100: filler = '0' elif num >= 100: filler = '' else: print num print "ERROR" return str(filler+str(num)) class WikiStore: def __init__(self,output,maxLine,outputFile,inputFolder): self.maxLine = maxLine self.adjust = ccp.AdjustName() self.output = self.adjust.addSlash(output) self.outputFile = outputFile self.inputFolder = inputFolder self.folderNum = self.getFoldNum(self.inputFolder) def getFoldNum(self,inputFolder): num = inputFolder[:-1].rindex('/') return inputFolder[num+1:] def makeNewArticleFileName(self,input,fileNum): while True: titleGroup = regex.regtitlepreprocess.search(input) if titleGroup: #retrive the correct article title = titleGroup.group(1) title = re.sub(r'/','', title) #print input[:100] #print fileNum #print title else: return False,False filler = self.makeLineNumFiller(fileNum,4) newFileName = self.output+filler+"."+title if not os.path.isfile(newFileName): fileNum = fileNum + 1 return newFileName,fileNum else: fileNum = fileNum + 1 def makeNewFileName(self,fileNum): newFileName = self.output+self.folderNum+'.txt' # WeScience.'+filler return newFileName,fileNum+1 def adjustArticleNum(self,keepWriting,articleNum,articleLineNum): if keepWriting == 'new': return articleNum + 1,1 else: return articleNum,articleLineNum def adjustArticleNumLine(self,line,articleNumLine): if "
" in line: return articleNumLine elif "
" in line: return 1 def saveCorpus(self,input): maxLine = self.maxLine articleLineNum = 0 #articleLineNum = self.folderNum + 00 articleNum = 0 articleNum = (int(self.folderNum)-100)*100 #print articleNum fileLineNum = 0 totalLineNum = 0 fileNum = 1 inputIndex = 0 if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: newFileName, fileNum = self.makeNewArticleFileName(input,fileNum) #newFileName = newFileNameObject = open(newFileName,'w') """ if self.outputFile: newFileNameObject = open(self.outputFile,'w') else: newFileNameObject = open(newFileName,'w') """ lineSplit = input.split('\n') for totalLineNum in range(len(lineSplit)): line = lineSplit[totalLineNum] inputIndex = inputIndex + len(line)+1 keepWriting = self.continueArticle(line,fileLineNum,totalLineNum,lineSplit,maxLine) if keepWriting: articleNum,articleLineNum = self.adjustArticleNum(keepWriting,articleNum,articleLineNum) if not "ARTICLE>" in line: readyLine = self.formatLine(line,articleLineNum,articleNum) newFileNameObject.write(readyLine) fileLineNum = fileLineNum + 1 articleLineNum = articleLineNum + 1 if "ARTICLE>" in line: articleNumLine = self.adjustArticleNumLine(line,articleLineNum) if not keepWriting: newFileNameObject.close() if totalLineNum + 3 > len(lineSplit): # Terminates after last article break if maxLine: newFileName,fileNum = self.makeNewFileName(fileNum) else: #newFileName,fileNum = self.makeNewArticleFileName(" ".join(lineSplit[totalLineNum:]),fileNum) newFileName,fileNum = self.makeNewArticleFileName(input[inputIndex:],fileNum) newFileNameObject = open(newFileName,'w') fileLineNum = 0 articleLineNum = 1 articleNum = articleNum + 1 def continueArticle(self,line,articleLineNum,totalLineNum,lineSplit,maxLine): if not maxLine: if line.startswith(''): return False else: return True if maxLine: if not line.startswith(''): return True if line.startswith(''): try: if int(articleLineNum) + lineSplit[totalLineNum+1:].index('') < int(maxLine): return 'new' else: return False except: return False def formatLine(self, line, lineNum, fileNum): """ fileNum = self.makeFiller(fileNum) lineNum = self.makeFiller(lineNum) """ fileNum = self.makeLineNumFiller(fileNum,7) lineNum = self.makeLineNumFiller(lineNum,4) #if 'blockquote>' in line: # readyLine = '[1'+str(fileNum)+str(lineNum)+'0]*|'+line+'\n' #else: readyLine = '[1'+str(fileNum)+str(lineNum)+'0] |'+line+'\n' return readyLine def makeLineNumFiller(self,num,total): totalLine = total - 1 stringNum = str(num) while True: if len(stringNum) > totalLine: return stringNum else: stringNum = '0'+stringNum """ def makeFiller(self,num): if num < 10: filler = '00' elif num > 9 and num < 100: filler = '0' elif num >= 100: filler = '' else: print num print "ERROR" return str(filler+str(num)) """ """ def saveCorpusDump(self,input): if self.dump == 1: newFileNameObject = open(self.output,'w') lineSplit = input.split('\n') lineNum = 0 for line in lineSplit: if not line.startswith(''): readyLine = self.formatLine(line,lineNum,'9999') newFileNameObject.write(readyLine) lineNum = lineNum + 1 newFileNameObject.close() """ def checkSyntax(output,input): if output == None or input == None: return False if os.path.isdir(output) and os.path.isfile(input): return True else: return False #if not maxLine == None or not isinstance(maxLine,int): # return False def main(): parser = OptionParser() parser.add_option("-i", "--input", dest="input", help="Input file for cleansed Wikipedia Source files", metavar="Input File") parser.add_option("-o", "--outputfolder", dest="output", help="Output folder/file where cleaned up versions of Wikipedia Source files will be stored. If file name must be provided if the -dump option is used, or a folder name if -dump is not enabled", metavar="Output Folder") parser.add_option("-f", "--outputfile", dest="outputFile", help="Output file", metavar="Output File name") parser.add_option("-r", "--inputFolder", dest="inputFolder", help="Input folder for raw Wikipedia Source files", metavar="Input Folder") parser.add_option("-l", "--maxline", dest="line", help="Maximum number of lines per file", metavar="Maximum number of lines") parser.add_option("-t", "--type", dest="type", help="Type of output format, wholeWiki or WeScience", metavar="Output Format") options, args = parser.parse_args() input = options.input output = options.output maxLine = options.line outputFile = options.outputFile inputFolder = options.inputFolder outputFormat = options.type if checkSyntax(output,input): pass else: sys.stdout.write("\nERROR WITH THE INPUT/OUTPUT FILES\n") sys.exit(0) if not outputFormat =="wholeWiki": inputFolder = False outputFile = False wikiStore = WikiStoreWeScience(output,maxLine) else: wikiStore = WikiStore(output,maxLine,outputFile,inputFolder) wikiReader = ccp.WikiReader() wikiProcess = WikiProcessorAfterToken() fileObject = open(input,'r') input = wikiReader.readFile(fileObject) input = wikiProcess.processFileAfterToken(input) #wikiStore.saveCorpus(input) wikiStore.saveCorpus(input) #wikiStore.saveCorpus(input) if __name__ == '__main__': main()