"""Copy articles served by a local Wikipedia mirror into a numbered corpus.

Usage: run the script with one argument, the path of a file holding one
article URL per line (markup tags on the line are stripped).

Each URL is requested and then the file ``/var/tmp/result`` is read.
NOTE(review): presumably the local server writes the rendered article to
that file as a side effect of the request -- confirm against the server.
"""
import os
import re
import shutil
import sys
import urllib

import regex

try:
    from urllib import urlopen  # Python 2: keeps the original behaviour
except ImportError:
    from urllib.request import urlopen  # Python 3 fallback


def _fetch(url):
    """Request *url* and discard the response body."""
    page = urlopen(url)
    page.close()


class CopyCorpus:
    """Read the current result file and copy suitable articles to a corpus.

    Attributes:
        title: first line of the result file (set by ``openFile``).
        titleDict: titles already copied; only the keys are meaningful.
        resultFile: path of the file that holds the current article.
        corpusDir: directory that receives the ``NNN.<title>.source`` files.
        fileObject: open handle on ``resultFile`` ('' before the first open).
        file: everything in the result file after the title line.
        num: sequence number used for the next copied article.
    """

    # Prefix of article URLs; the address must correspond with the local
    # Wikipedia server.
    ARTICLE_URL = 'http://127.0.0.1:8000/article/'
    # Maximum number of redirect hops followed by redirectCheck().
    MAX_REDIRECTS = 5
    # Articles whose body is shorter than this are not copied.
    MIN_LENGTH = 2000

    def __init__(self, title='', resultFile='/var/tmp/result',
                 corpusDir='/home/gisle/phd/wikipedia/corpustest/'):
        self.title = title
        self.titleDict = {}
        self.resultFile = resultFile
        self.corpusDir = corpusDir
        self.fileObject = ''
        self.file = ''
        self.num = 1

    def openFile(self):
        """Open the result file and load its title line and body."""
        self.fileObject = open(self.resultFile, 'r')
        self.title = self.fileObject.readline().rstrip()
        self.file = self.fileObject.read()

    def closeFile(self):
        """Close the result file; do nothing if it was never opened."""
        # fileObject is '' until openFile() has run; closing an already
        # closed file object is harmless.
        if self.fileObject:
            self.fileObject.close()

    def redirectCheck(self):
        """Follow redirect links until a real article is loaded.

        While the loaded text matches ``regex.regredirect``, the article
        named by group 1 of the match is requested and the result file is
        re-read.

        Returns:
            True as soon as the loaded text contains no redirect, False
            when ``MAX_REDIRECTS`` hops have been followed.  The page
            loaded by the final hop is not inspected.
        """
        for _ in range(self.MAX_REDIRECTS):
            redirect = regex.regredirect.search(self.file)
            if not redirect:
                return True
            # Retrieve the correct article; spaces become underscores.
            url = (self.ARTICLE_URL + redirect.group(1)).replace(' ', '_')
            print("WILL TRY: " + url)
            _fetch(url)
            self.closeFile()
            self.openFile()
        return False

    def testIfApplicable(self):
        """Return True if the article is long enough and not yet copied."""
        if len(self.file) < self.MIN_LENGTH:
            return False
        return self.title not in self.titleDict

    def copyCorpus(self):
        """Copy the result file to ``<corpusDir>/<NNN>.<title>.source``.

        The copy is made only when the target does not exist yet and
        ``testIfApplicable()`` accepts the article; the title is then
        recorded and the sequence number advanced.
        """
        # Slashes are dropped so the title cannot introduce a path separator.
        title = self.title.replace('/', '')
        # Sequence number zero-padded to at least three digits.
        name = '%03d' % self.num + "." + title + ".source"
        filename = os.path.join(self.corpusDir, name)
        if os.path.isfile(filename):
            return
        if self.testIfApplicable():
            shutil.copy(self.resultFile, filename)
            self.titleDict[self.title] = ''
            self.num = self.num + 1


def main(urlfile=None):
    """Process every URL listed in *urlfile*.

    Args:
        urlfile: path of the URL list; defaults to the first command-line
            argument.
    """
    if urlfile is None:
        if len(sys.argv) < 2:
            sys.exit('usage: %s URL_LIST_FILE' % sys.argv[0])
        urlfile = sys.argv[1]

    cc = CopyCorpus()
    with open(urlfile, 'r') as urllist:
        for urlline in urllist:
            # Strip markup tags and trailing whitespace from the line.
            url = re.sub(r'<[^>]+>', '', urlline).rstrip()
            if not url:
                # Blank or markup-only line: nothing to request.
                continue
            _fetch(url)
            print(url)
            cc.openFile()
            try:
                if cc.redirectCheck():
                    cc.copyCorpus()
            finally:
                # Release the result file even if a step above failed.
                cc.closeFile()


if __name__ == '__main__':
    main()