# -*- coding: utf-8 -*-
"""
---
Corpus Clean - Stage 1
Copyright (c) 2008-2010 Gisle Ytrestol (gisley@ifi.uio.no)

This program is free software; you can redistribute it and/or modify it
under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation; either version 2.1 of the License, or (at
your option) any later version.

This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
License for more details.

The first stage of this script takes a folder of Wikipedia articles in the
original Wikipedia markup, and strips the article for unwanted markup. The
script is intended to be used together with Tokenizer v1.0,
http://www.cis.uni-muenchen.de/~wastl/misc/. The output of this script is
a single file which should be used as input for Tokenizer v1.0 .

The output from Tokenizer should be used by stage 2, cccp.py. This second
stage inserts proper sentence boundaries and allows the user to specify
whether the output should be one single file for each corresponding input
file, or whether the entire corpus should be dumped into one single
file/corpus.

Run python ccp.py -h for help!
"""

import codecs
import os
import re
import string
import sys
import urllib
from optparse import OptionParser

# Project-local modules. `regex` supplies the pattern objects (regex.reg*)
# that every cleaning step below calls .sub()/.search()/.match() on.
import regex
import cccp


class AdjustName:
    """Helpers that add or remove a single trailing '/' on a path string."""

    def addSlash(self, name):
        """Return `name` ending in '/'; an empty string is returned as is."""
        if name and not name.endswith('/'):
            return name + '/'
        return name

    def removeSlash(self, name):
        """Return `name` without one trailing '/'; empty stays empty."""
        if name.endswith('/'):
            return name[:-1]
        return name


class WikiReader:
    """Thin wrappers around file objects and directory listings."""

    def readFile(self, file):
        """Return everything that is left to read from the open `file`."""
        return file.read()

    def listFiles(self, inputFolder):
        """Return the sorted entries of `inputFolder`, skipping dot-files."""
        return sorted(
            name for name in os.listdir(inputFolder)
            if not name.startswith('.')
        )

    def readFirstLine(self, file):
        """Return the next line of the open `file` (the first, if unread)."""
        return file.readline()


class WikiProcessor:
    """Strips Wikipedia markup from one article at a time."""

    def __init__(self, redirect):
        """Store the redirect switch; `None` means redirects are enabled (1)."""
        self.dictChecker = WikiDict()
        self.redirect = 1 if redirect is None else redirect

    def tableCleaner(self, input):
        """Remove tables via placeholder characters.

        Table start/end matches are replaced by the placeholders 'Ӂ' and
        'ጣ', converted spans are deleted until none match any more, and
        whatever placeholders remain are turned back into '{|' and '|}'.
        """
        input = regex.regtableConvertStart.sub(r'Ӂ', input)
        input = regex.regtableConvertEnd.sub(r'\1ጣ', input)
        # Repeat until stable (presumably to peel nested tables layer by layer).
        while regex.regtableConverted.search(input):
            input = regex.regtableConverted.sub(r'', input)
        input = regex.regtableConvertRevertStart.sub('{|', input)
        input = regex.regtableConvertRevertEnd.sub('|}', input)
        return input

    def tableCleanerStephan(self, input):
        """Delete regtableGisleImproved2 matches until none remain."""
        while regex.regtableGisleImproved2.search(input):
            input = regex.regtableGisleImproved2.sub(r'', input)
        return input

    def tableCleaner2(self, input):
        """Drop table lines, scanning top-down.

        A line matching regex.regtablestart opens a table and is dropped.
        While inside a table, lines starting with '|' or '!' are dropped;
        the first other line ends the table and is kept.
        """
        keepInput = []
        inTable = False
        for inputLine in input.split("\n"):
            isTableRow = inputLine.startswith(("|", "!"))
            if isTableRow and inTable:
                continue
            if regex.regtablestart.search(inputLine):
                inTable = True
                continue
            if inputLine.startswith("|}") and inTable:
                inTable = False
                continue
            if inTable and not isTableRow:
                inTable = False
            keepInput.append(inputLine)
        return "\n".join(keepInput)

    def tableCleaner2Reverse(self, input):
        """Drop table lines, scanning bottom-up.

        Mirror image of tableCleaner2: a line starting with '|}' opens a
        table, a line starting with '{|' closes it, and the line order of
        the result is restored before returning.
        """
        keepInput = []
        inTable = False
        for inputLine in reversed(input.split("\n")):
            isTableRow = inputLine.startswith(("|", "!"))
            if isTableRow:
                if inTable:
                    continue
                if inputLine.startswith("|}"):
                    inTable = True
                    continue
            if inputLine.startswith("{|") and inTable:
                inTable = False
                continue
            if inTable and not isTableRow:
                inTable = False
            keepInput.append(inputLine)
        keepInput.reverse()
        return "\n".join(keepInput)

    def tableCleaner3(self, input):
        """Remove every line that starts with '!' or '|' (normally tables)."""
        return "\n".join(
            inputLine for inputLine in input.split("\n")
            if not inputLine.startswith(("|", "!"))
        )

    def processFile(self, firstLine, wholeFile):
        """Clean one article.

        Returns (title, body), or (False, False) when an article with the
        same first line has already been processed.
        """
        if self.redirect == 1:
            try:
                firstLine, wholeFile = self.redirectCheck(firstLine, wholeFile)
            except Exception:
                # Best effort: report and carry on with the unresolved text.
                # Only Exception is caught so Ctrl-C / sys.exit still work.
                sys.stdout.write("\nERROR WITH THE REDIRECT PROCESSING\nARE YOU SURE AN OFFLINE WIKIPEDIA READER IS ENABLED?\n\nTo run the script without redirect processing, use the -n option\n")
        if not self.dictChecker.checkIfIn(firstLine):
            return False, False
        firstLine, wholeFile = self.cleanArticle(firstLine, wholeFile)
        firstLine = self.addTitle(firstLine)
        return firstLine, wholeFile

    def cleanArticle(self, firstLine, wholeFile):
        """Run the markup cleaner over title and body, then trim the body's
        trailing sections with removeEnd."""
        firstLine = self.regCleanFile(firstLine)
        wholeFile = self.regCleanFile(wholeFile)
        wholeFile = self.removeEnd(wholeFile)
        return firstLine, wholeFile

    def addTitle(self, firstLine):
        """Return the right-stripped title wrapped in the title delimiters."""
        # NOTE(review): both delimiters consist of a single newline in the
        # reviewed text; confirm no markup was lost from these literals.
        return '\n' + firstLine.rstrip() + '\n'

    def redirectCheck(self, firstLine, wholeFile):
        """Resolve a redirect through the local Wikipedia reader.

        If the article contains a redirect link, the redirect URL is used
        to retrieve the correct article; otherwise the arguments are
        returned unchanged.
        """
        if not regex.regredirect.search(wholeFile):
            return firstLine, wholeFile
        title = regex.regredirect.sub(r'\1', wholeFile)
        # Address must correspond with the local Wikipedia reader.
        url = 'http://127.0.0.1:8000/article/' + title
        url = re.sub(r' ', '_', url)
        # The response itself is not read; the request is made only for
        # its side effect.
        page = urllib.urlopen(url)
        page.close()
        # The wiki article is stored here when it is accessed.
        thisfile = '/var/tmp/result'
        with open(thisfile, 'r') as resultFile:
            firstLine = resultFile.readline()
            wholeFile = resultFile.read()
        return firstLine, wholeFile

    def addNewline(self, input):
        """Replace every regex.regeos match with a newline."""
        return regex.regeos.sub('\n', input)

    def removeEnd(self, input):
        """Strip trailing sections (sources, see also, notes, references,
        bibliography, footnotes, related, external) from an article."""
        # Mark lookahead matches with '___'; each pattern is applied three
        # times, exactly as the cleaning pipeline has always done.
        for lookahead in (regex.regsourcelookahead,
                          regex.regseealsolookahead,
                          regex.regnoteslookahead,
                          regex.regreflookahead):
            for _ in range(3):
                input = lookahead.sub(r'___', input)
        for section in (regex.regsealso,
                        regex.regnotes,
                        regex.regreferences,
                        regex.regsources):
            input = section.sub('', input)
        for restore in (regex.regsourcelookaheadrestore,
                        regex.regseealsolookaheadrestore,
                        regex.regnoteslookaheadrestore,
                        regex.regreflookaheadrestore):
            input = restore.sub(r'\2\1\2', input)
        for section in (regex.regbibliography,
                        regex.regfootnotes,
                        regex.regrelated,
                        regex.regexternal):
            input = section.sub('', input)
        return input

    def regCleanFile(self, input):
        """Apply the full sequence of markup-removal substitutions.

        The order of the steps is significant and must not be changed.
        """
        # Wrap templates that should survive in '<___ ... ___>'.
        input = regex.regipa.sub(r'<___\1___>', input)
        input = regex.regjava.sub(r'<___\1___>', input)
        input = regex.regiast.sub(r'<___\1___>', input)
        ## japanese article templates
        #input = regex.regtransjap.sub(r'\1',input)
        #input = regex.reglangjap.sub(r'\1',input)
        input = regex.reglanggeneralpreserve.sub(r'<___\1___>', input)
        input = regex.regtransgeneralpreserve.sub(r'<___\1___>', input)
        input = regex.regnihongopreservere.sub(r'<___\1___>', input)
        input = regex.regharv_general.sub(r'<___\1___>', input)
        input = regex.regaudio_general.sub(r'<___\1___>', input)
        input = regex.regflagtemplate.sub(r'<___\1___>', input)

        # IF WE WANT TO EXPAND THE TEMPLATES, USE THESE!
        # input = regex.regharv_aut_aut_year_page.sub(r'(\1 & \2 \3, \4)',input)
        # input = regex.regharv_aut_year_page.sub(r'(\1 2, \3)',input)
        # input = regex.regharvtxt_aut_year_page.sub(r'\1 (\2, \3)',input)
        # input = regex.regharvtxt_aut_year.sub(r'\1 (\2)',input)
        # input = regex.regharvtxt_aut_aut_year_page.sub(r'\1 & \2 (\3, \4)',input)
        # input = regex.regharvtxt_aut_aut_year.sub(r'\1 & \2 (\3)',input)
        # input = regex.regharvnb_aut_year_page_nb.sub(r'\1 \2, \3',input)
        # input = regex.regharvnb_aut_year_nb.sub(r'\1 \2',input)
        # input = regex.regharvnb_aut_aut_year_page_nb.sub(r'\1 & \2 \3, \4',input)
        # #input = regex.regharvnb_aut_aut_year_page_nb.sub(r'\1 & \2 \3, \4',input)
        # input = regex.regharvnb_aut_aut_aut_year_page_nb.sub(r'\1, \2 & \3 \4, \5',input)
        # input = regex.regharvnb_aut_aut_year_nb.sub(r'\1 & \2 \3',input)
        # input = regex.regharvnb_aut_aut_aut_year_nb.sub(r'\1, \2 & \3 \4',input)
        # input = regex.regharvcoltxt_aut_year_page.sub(r'\1 (\2:\3)',input)

        # TO EXPAND TEMPLATES, USE THESE!
        # #input = regex.regtransgeneral.sub(r'\1',input)
        # #input = regex.reglanggeneral.sub(r'\1',input)
        # input = regex.regnihongohardcode.sub(r'\1)',input)
        # input = regex.regnihongojap5.sub(r'\1 (\2 \3 \4 \5)',input)
        # input = regex.regnihongojap4.sub(r'\1 (\2 \3 \4)',input)
        # input = regex.regnihongojap3.sub(r'\1 (\2 \3)',input)
        # input = regex.regnihongojap2.sub(r'\1 (\2)',input)

        #while regex.regcurly1.search(input):
        #    input = regex.regcurly1.sub('',input)

        # Delete templates and box tables until none are left.
        while regex.reglongTemp.search(input):
            input = regex.reglongTemp.sub('', input)
        while regex.regboxtable.search(input):
            input = regex.regboxtable.sub('', input)

        # input = regex.regcurly1.sub('',input)
        # input = regex.regcurly1.sub('',input)
        # input = regex.regcurly1.sub('',input)
        # input = regex.regcurly1.sub('',input)
        # input = regex.regcurly1.sub('',input)

        # NOTE(review): the replacement is newline + group 1 + newline in
        # the reviewed text; confirm no markup was lost from this literal.
        input = regex.regblockquote.sub('\n\\1\n', input)
        input = regex.regdiv2.sub('', input)
        input = regex.reggallery.sub('', input)
        input = regex.regimage.sub(r'', input)
        input = regex.regimage.sub(r'', input)
        input = regex.regref2.sub('', input)
        input = regex.regref.sub('', input)
        input = regex.regcomment.sub('', input)
        input = regex.regsingleast.sub('', input)
        input = regex.regdeflist.sub(r'\1', input)

        #input = self.tableCleaner(input)
        input = self.tableCleanerStephan(input)
        input = self.tableCleaner2(input)
        input = self.tableCleaner2Reverse(input)
        input = regex.regwikitable3.sub('', input)

        # INCLUDED IF NOT NOW oct 8
        # input = regex.regwikitable2.sub('',input)
        # input = regex.regwikitable2.sub('',input)
        # input = regex.regwikitable2.sub('',input)
        # input = regex.regwikitable2.sub('',input)
        # input = regex.regwikitable2.sub('',input)
        # input = regex.regwikitable3.sub('',input) #new

        input = regex.regtable.sub('', input)
        input = regex.regtableborder.sub('', input)
        input = regex.regtablehardcode.sub('', input)
        input = regex.regcategory.sub('', input)
        #input = self.tableCleaner3(input)
        input = regex.regbacktocurly1.sub('{{', input)
        input = regex.regbacktocurly2.sub('}}', input)
        input = regex.regsentinitialbracket.sub(r'\1\2', input)
        input = regex.regbracket.sub(r'\1', input)
        input = regex.regbullets.sub(r'\1', input)
        input = regex.regbullets2.sub(r'\1', input)
        input = regex.regindentcolon.sub(r'\1', input)
        input = regex.regbulletscolon.sub(r'\2', input)
        input = regex.regbr.sub(r'', input)
        input = regex.regtitle.sub(r'\1', input)
        input = regex.regparagraph.sub('', input)
        input = regex.regyeareos.sub(r'\1', input)
        input = regex.regorg.sub(r'\1', input)
        input = regex.reghyphen.sub('', input)
        #remove no wiki
        #input = regex.regremovenowiki.sub('',input)
        input = self.removeLines(input)
        input = regex.regremovenewline.sub(' ', input)
        return input

    def removeTableLeftover(self, input):
        """Drop leftover table and XML-only lines.

        A line is kept only if it matches regex.regletternumber, is not an
        XML-only line (lines containing 'ARTICLE>' are exempt from that
        test) and does not start with '|', '!', '{|' or '|}'. Trailing
        whitespace of the result is stripped.
        """
        kept = []
        for line in input.split('\n'):
            if not regex.regletternumber.search(line):
                continue
            if "ARTICLE>" not in line and regex.regonlyXML.match(line):
                continue
            if line.startswith(("|", "!", "{|", "|}")):
                continue
            kept.append(line)
        return "\n".join(kept).rstrip()

    def removeLines(self, line):
        """Keep only the lines of `line` that match regex.regletternumber."""
        return "\n".join(
            part for part in line.split("\n")
            if regex.regletternumber.search(part)
        )


class WikiDict:
    """Remembers which article first lines have been seen."""

    def __init__(self, wikiDict=None):
        # The `wikiDict` argument is accepted but not used; every instance
        # starts with an empty table.
        self.wikiDict = {}

    def checkIfIn(self, firstLine):
        """Return True and record `firstLine` the first time it is seen,
        False on every later call with the same value."""
        if firstLine in self.wikiDict:
            return False
        self.wikiDict[firstLine] = ""
        return True


class WikiWriter:
    """Appends cleaned articles to a single output file."""

    def __init__(self, output):
        """Open `output` for writing, truncating any existing file."""
        self.outFile = open(output, 'w')

    def writeFile(self, firstLine, wholeFile):
        """Write title, body and a separator; do nothing for an empty body."""
        if wholeFile:
            self.outFile.write(firstLine)
            self.outFile.write(wholeFile)
            # NOTE(review): the separator is three newlines in the reviewed
            # text; confirm no markup was lost from this literal.
            self.outFile.write('\n\n\n')

    def closeFile(self):
        """Close the output file."""
        self.outFile.close()


def checkSyntax(input, output):
    """Return True when `input` is an existing directory and `output` is
    not a directory; False otherwise or when either is None."""
    if input is None or output is None:
        return False
    return os.path.isdir(input) and not os.path.isdir(output)


def main():
    """Parse the command line and clean every file of the input folder
    into the single output file."""
    parser = OptionParser()
    parser.add_option("-i", "--input", dest="input",
                      help="Input folder where Wikipedia Source files are stored",
                      metavar="Input Folder")
    parser.add_option("-o", "--output", dest="output",
                      help="Output file where cleansed Wikipedia Source files will be stored",
                      metavar="Output File")
    parser.add_option("-n", "--noredirects", dest="redirects",
                      action="store_false",
                      help="No connection to local Wikipedia Reader, therefore no redirect processing.",
                      metavar="redirects")
    options, args = parser.parse_args()
    inputFolder = options.input
    outputFile = options.output
    redirect = options.redirects

    if not checkSyntax(inputFolder, outputFile):
        sys.stdout.write("\nERROR WITH THE INPUT/OUTPUT FILES\n")
        sys.exit(0)

    wikiReader = WikiReader()
    wikiProcess = WikiProcessor(redirect)
    wikiWriter = WikiWriter(outputFile)
    adjustName = AdjustName()
    inputFolder = adjustName.removeSlash(inputFolder)
    try:
        for entry in wikiReader.listFiles(inputFolder):
            fileName = inputFolder + '/' + entry
            if not os.path.isfile(fileName):
                continue
            # Both reads happen inside the `with`, so the handle is closed
            # even if reading or later processing raises.
            with open(fileName, 'r') as fileObject:
                firstLine = wikiReader.readFirstLine(fileObject)
                wholeFile = wikiReader.readFile(fileObject)
            firstLine, wholeFile = wikiProcess.processFile(firstLine, wholeFile)
            # processFile returns False for an article that was already seen.
            if firstLine:
                wikiWriter.writeFile(firstLine, wholeFile)
    finally:
        wikiWriter.closeFile()


if __name__ == '__main__':
    main()