"""Copy every file of a wikidump folder into numbered sub-folders of ``raw/``.

Files are taken in sorted name order, given a zero-padded running number as
a prefix, and grouped 100 per destination folder.
"""
import codecs
import os
import re
import shutil
import sys

try:
    import cPickle  # Python 2
except ImportError:  # Python 3 has no cPickle; keep the name importable.
    import pickle as cPickle


def _zero_pad(value, width):
    """Return ``str(value)`` left-padded with '0' up to *width* characters."""
    # rjust (not zfill) reproduces plain left-padding exactly: values already
    # at or beyond the width are returned unchanged, and a sign is not moved.
    return str(value).rjust(width, '0')


def folderIncrement(int):
    """Return the folder number as a string zero-padded to 5 characters.

    The parameter name shadows the builtin ``int``; it is kept so existing
    keyword callers keep working.
    """
    return _zero_pad(int, 5)


def fileIncrement(int):
    """Return the file number as a string zero-padded to 7 characters.

    The parameter name shadows the builtin ``int``; it is kept so existing
    keyword callers keep working.
    """
    return _zero_pad(int, 7)


def main():
    """Copy the files of the folder named by the last CLI argument into raw/.

    Each source file ``name`` is written to
    ``raw/<folder>/<file number>.<name>``, where the file number starts at
    100 and a new folder (starting at 00101) is begun every time the file
    number is a multiple of 100.

    Exits with status 1 when no folder argument is supplied.
    """
    if len(sys.argv) < 2:
        print("Provide wikidump folder")
        # A usage error must not be reported as success.
        sys.exit(1)

    input_dir = sys.argv[-1]

    print("LISTING FILES")
    names = os.listdir(input_dir)
    print("SORTING FILES")
    names.sort()

    folder_num = 100
    folder_path = None

    print("PROCESSING...")
    for file_num, name in enumerate(names, 100):
        if file_num % 100 == 0:
            # Start a new destination folder every 100 files. The very first
            # file (number 100) also lands here, so folder_path is always
            # set before it is used below.
            print("processing file number %s" % file_num)
            folder_num += 1
            folder_path = os.path.join('raw', folderIncrement(folder_num))
            if not os.path.exists(folder_path):
                # makedirs also creates the parent 'raw' folder if missing.
                os.makedirs(folder_path)

        source_path = os.path.join(input_dir, name)
        target_path = os.path.join(
            folder_path, fileIncrement(file_num) + "." + name)
        # Binary mode copies the bytes untouched; the context managers close
        # both handles even if the copy fails part-way.
        with open(source_path, 'rb') as source:
            with open(target_path, 'wb') as target:
                shutil.copyfileobj(source, target)


if __name__ == '__main__':
    main()