import re,sys, os, cPickle, codecs

def folderIncrement(int):
	"""Return the text form of *int* left-padded with '0' to 5 characters.

	A value whose text form already has 5 or more characters is returned
	unchanged (it is never truncated).
	"""
	# NOTE: the parameter name shadows the builtin ``int``; it is kept as-is
	# because it is part of the signature callers see.
	return str(int).rjust(5, '0')


def fileIncrement(int):
	"""Return the text form of *int* left-padded with '0' to 7 characters.

	A value whose text form already has 7 or more characters is returned
	unchanged (it is never truncated).
	"""
	# NOTE: the parameter name shadows the builtin ``int``; it is kept as-is
	# because it is part of the signature callers see.
	return str(int).rjust(7, '0')



def main():
	"""Copy every entry of a wikidump folder into numbered 'raw/NNNNN' folders.

	The source folder is the last command-line argument.  Its entries are
	listed, sorted by name and copied one at a time.  Each copy is named
	'<7-digit counter>.<original name>', and a new destination folder is
	started every 100 files.  Both counters start at 100, so the first
	folder used is 'raw/00101' and the first file prefix is '0000100'.

	Prints a usage message and exits with status 0 when no argument is given.
	"""
	if len(sys.argv) < 2:
		print("Provide wikidump folder")
		sys.exit(0)
	outputFolder = sys.argv[-1]
	print("LISTING FILES")
	folderList = os.listdir(outputFolder)
	print("SORTING FILES")
	folderList.sort()
	folderName = 100
	fileNum = 100
	# Always assigned on the first iteration, because fileNum starts on a
	# multiple of 100.
	folderPath = None
	print("PROCESSING...")

	for articleName in folderList:
		if fileNum % 100 == 0:
			print("processing file number %s" % fileNum)
			folderName += 1
			folderPath = os.path.join('raw', folderIncrement(folderName))
			if not os.path.exists(folderPath):
				# makedirs also creates the parent 'raw' directory, which
				# plain mkdir would fail on when it does not exist yet.
				os.makedirs(folderPath)
		sourcePath = os.path.join(outputFolder, articleName)
		targetPath = os.path.join(
			folderPath, fileIncrement(fileNum) + "." + articleName)
		# Binary mode copies the bytes verbatim on every platform; the
		# context managers close both handles even if the copy fails.
		with open(sourcePath, 'rb') as articleFile:
			with open(targetPath, 'wb') as newArticle:
				newArticle.write(articleFile.read())
		fileNum += 1
		
	
	

# Run main() only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == '__main__':
    main()