"""Split a Wikipedia XML dump into one plain-text file per article.

To find breakpoints, use:

    grep -b "" enwiki-dump > pagebreaks.txt

then:

    cat pagebreaks.txt | python findBreakPoints.py

then:

    python wikidumpArticleparser2.py enwiki-dump

NOTE(review): the grep pattern above is empty in the version under review;
it was presumably "<page>" -- confirm before relying on these steps.

The module runs on both Python 2 and Python 3.  All text is processed in the
interpreter's native ``str`` type and encoded as UTF-8 when written to disk.
"""

import codecs
import os
import re
import string
import sys

try:
    import cPickle
except ImportError:  # Python 3: the accelerated pickler is plain ``pickle``.
    import pickle as cPickle


# The opening tag is required: without it the group also captures the
# ``title>`` that precedes the actual page title.
titleReg = re.compile(r'<title>([^<]+)</title>', re.MULTILINE | re.DOTALL)

# Second alternative matches the self-closing form used for empty pages; in
# that case group(1) is None.
articleReg = re.compile(
    r'<text xml:space="preserve">(.*?)</text>|<text xml:space="preserve"\s?/>',
    re.MULTILINE | re.DOTALL)

# Order matters: ``&amp;`` is decoded last so that an escaped entity such as
# ``&amp;lt;`` becomes ``&lt;`` rather than being unescaped twice into ``<``.
_ENTITIES = (
    ('&lt;', '<'),
    ('&gt;', '>'),
    ('&apos;', "'"),
    ('&#039;', "'"),
    ('&#39;', "'"),
    ('&quot;', '"'),
    ('&amp;', '&'),
)


def replaceEntities(input):
    """Return *input* with the basic XML character entities decoded.

    The parameter name shadows the ``input`` builtin; it is kept so that
    existing callers are unaffected.
    """
    for entity, char in _ENTITIES:
        input = input.replace(entity, char)
    return input


def removetitledash(input):
    """Return *input* with every ``/`` replaced by ``_``.

    Titles are used as file names, so a slash would otherwise be interpreted
    as a directory separator.
    """
    return input.replace('/', '_')


def folderIncrement(int):
    """Return ``str(int)`` left-padded with zeros to at least 8 characters.

    Values whose string form is already 8 characters or longer are returned
    unpadded.
    """
    return str(int).rjust(8, '0')


def checkIfMoving(artName, moveList):
    """Return False if *artName* starts with any prefix in *moveList*.

    Returns True when no prefix matches, including when *moveList* is empty.
    """
    return not any(artName.startswith(prefix) for prefix in moveList)


def _to_native_str(data):
    """Convert *data* to the interpreter's native ``str`` type via UTF-8."""
    if isinstance(data, str):
        return data
    if isinstance(data, bytes):
        # Python 3 binary input.
        return data.decode('utf-8')
    # Python 2 ``unicode`` (what a codecs reader returns).
    return data.encode('utf-8')


def _ensure_parent_dir(prefix):
    """Create the directory part of the path prefix *prefix* if it is missing."""
    directory = os.path.dirname(prefix)
    if directory and not os.path.isdir(directory):
        os.makedirs(directory)


def _write_article(path, name, content):
    """Write *name*, a newline, *content* and a newline to *path* as UTF-8."""
    data = name + '\n' + content + '\n'
    if not isinstance(data, bytes):
        data = data.encode('utf-8')
    with open(path, 'wb') as out:
        out.write(data)


def parseWiki(file, folderName, weScienceDict, moveArticles):
    """Extract the articles found in *file* into individual text files.

    Titles and article bodies are located independently with ``titleReg`` and
    ``articleReg`` and paired up in order of appearance; processing stops as
    soon as either runs out.

    An article is considered only if its decoded body is longer than 2000
    characters and its title is not in *weScienceDict*.  It is then written to
    ``folderName + title[:100]`` when ``checkIfMoving`` accepts the title, and
    to ``'throwout/' + title[:100]`` otherwise.

    NOTE(review): the indentation of the version under review was lost, so the
    pairing of the throw-out branch with the ``checkIfMoving`` test is an
    assumption -- confirm against the original layout.

    Args:
        file: open file-like object; ``read()`` must return the whole dump.
        folderName: path prefix (normally a directory ending in ``/``) for
            accepted articles.
        weScienceDict: container of titles to skip; only membership is tested.
        moveArticles: title prefixes that send an article to ``throwout/``.

    Returns:
        The number of articles written under *folderName*.
    """
    # Convert once instead of encoding the whole dump for every regex pass.
    dump = _to_native_str(file.read())
    throwOutFolder = 'throwout/'
    _ensure_parent_dir(folderName)
    _ensure_parent_dir(throwOutFolder)

    extracted = 0
    articles = articleReg.finditer(dump)
    for titleMatch in titleReg.finditer(dump):
        articleMatch = next(articles, None)
        if articleMatch is None:
            break

        # group(1) is None for a self-closing <text/> element.
        articleContent = replaceEntities(articleMatch.group(1) or '')
        articleName = replaceEntities(
            removetitledash(titleMatch.group(1) or ''))

        if len(articleContent) <= 2000 or articleName in weScienceDict:
            continue

        # File names are capped at 100 characters of the title.
        if checkIfMoving(articleName, moveArticles):
            _write_article(folderName + articleName[:100],
                           articleName, articleContent)
            extracted += 1
            if extracted % 5000 == 0:
                print("Finished parsing %s articles" % extracted)
        else:
            _write_article(throwOutFolder + articleName[:100],
                           articleName, articleContent)

    print("FINISHED, EXTRACTED %s ARTICLES" % extracted)
    return extracted


def main():
    """Parse the dump named by the last command-line argument into output/."""
    if len(sys.argv) < 2:
        print("Provide wikidump name")
        sys.exit(0)
    dumpPath = sys.argv[-1]

    # SECURITY: unpickling executes arbitrary code; 'wescienceArticles.py'
    # must be a trusted, locally produced file.
    with open('wescienceArticles.py', 'rb') as weDict:
        weScienceDict = cPickle.load(weDict)

    moveArticles = ['MediaWiki:', 'Category:', 'Help:', 'Image:', 'Portal:',
                    'Template:', 'Wikipedia:']
    print("Parsing file %s" % str(dumpPath))
    folderName = 'output/'
    with codecs.open(dumpPath, 'r', "utf-8") as f:
        parseWiki(f, folderName, weScienceDict, moveArticles)


if __name__ == '__main__':
    main()