"""Align the keyed lines of a "CC" corpus folder with an "SVN" corpus folder.

For every file name found at the same position in both folder listings, the
CC lines are compared with the SVN lines by their integer key and an
annotated copy is written to the output folder.  Progress messages go to
stdout.
"""

import difflib
import re
import sys
from optparse import OptionParser

from ccp import WikiReader

# A CC line is searched for among the SVN lines whose sorted index lies in
# [i - _SEARCH_WINDOW + 1, i + _SEARCH_WINDOW], i being the CC line's index.
_SEARCH_WINDOW = 30
# Similarity ratio above which two lines are reported as SIMILAR.
_SIMILAR_RATIO = 0.90
# Similarity ratio above which two joined SVN lines count as one CC line.
_SPLIT_RATIO = 0.99


def textInLines(file):
    """Return the lines of the text file at path *file* (newlines kept)."""
    # The context manager closes the handle even if reading fails.
    with open(file, 'r') as handle:
        return handle.readlines()


def _parseKeyedLines(lines):
    """Map the integer in columns 1-8 of each line to the text from column 11.

    Lines of ten characters or fewer are skipped.  Presumably lines look like
    ``[00000010] text`` -- confirm against the corpus format.

    Raises:
        ValueError: if columns 1-8 of a long enough line are not an integer.
    """
    keyed = {}
    for line in lines:
        if len(line) > 10:
            keyed[int(line[1:9])] = line[11:]
    return keyed


def dictFiles(svnFile, ccFile, svn_folder, cc_folder):
    """Load one SVN file and one CC file as key -> text dictionaries.

    Args:
        svnFile: file name inside *svn_folder*.
        ccFile: file name inside *cc_folder*.
        svn_folder: SVN folder; concatenated directly with the file name, so
            it must end with a path separator.
        cc_folder: CC folder; same convention as *svn_folder*.

    Returns:
        A tuple ``(svnDict, ccDict, sortedSVNDictKeys, sortedccDictKeys)``.
    """
    svnDict = _parseKeyedLines(textInLines(svn_folder + svnFile))
    # Fixed: the CC file used to be opened with the SVN file name, which
    # silently ignored the ccFile argument.
    ccDict = _parseKeyedLines(textInLines(cc_folder + ccFile))
    # sorted(d) iterates the keys on both Python 2 and 3 (iterkeys is 2-only).
    return svnDict, ccDict, sorted(svnDict), sorted(ccDict)


def makeSimilarOutput(key, string):
    """Return *string* prefixed with ``[key]SIMILAR``."""
    return '[%s]SIMILAR%s' % (str(key), string)


def makeNotFixedOutput(key, string):
    """Return *string* prefixed with ``[key]CHECKWITHSVN``."""
    return '[%s]CHECKWITHSVN%s' % (str(key), string)


def makeFixedOutput(key, string):
    """Return *string* prefixed with ``[key]*``."""
    return '[%s]*%s' % (str(key), string)


def makeOutput(key, string):
    """Return *string* prefixed with ``[key] `` (key, then one space)."""
    return '[%s] %s' % (str(key), string)


def _alignInWindow(ccIndex, key, ccText, svnDict, sortedSVNDictKeys):
    """Look for CC line *key* among the SVN lines near index *ccIndex*.

    Returns:
        The list of output chunks for the first SVN line that is identical,
        similar, or (joined with its predecessor) equal to *ccText*; ``None``
        when nothing in the window qualifies.
    """
    # Clamp the window to valid indices.  Index 0 is included: the original
    # bounds test ``svni > 0`` skipped the first SVN line (off-by-one).
    first = max(ccIndex - _SEARCH_WINDOW + 1, 0)
    stop = min(ccIndex + _SEARCH_WINDOW + 1, len(sortedSVNDictKeys))
    for svnIndex in range(first, stop):
        svnKey = sortedSVNDictKeys[svnIndex]
        svnText = svnDict.get(svnKey)
        if not svnText:
            continue
        ratio = difflib.SequenceMatcher(None, svnText, ccText).ratio()
        if ratio == 1:
            sys.stdout.write(
                "KEY %s IN SVN AND KEY %s IN CC IS IDENTICAL (%s) - FIXING\n"
                % (str(svnKey), str(key), ratio))
            return [makeOutput(svnKey, ccText)]
        if ratio > _SIMILAR_RATIO:
            sys.stdout.write(
                "KEY %s IN SVN AND KEY %s IN CC IS SIMILAR (%s) - NOT FIXING\n"
                % (str(svnKey), str(key), ratio))
            return [makeSimilarOutput(svnKey, ccText)]
        # An SVN key not ending in 0 may be the second half of a line that
        # is a single line in CC: try the previous SVN line joined with it.
        if svnIndex > 0 and str(svnKey)[-1] != '0':
            lastSVNKey = sortedSVNDictKeys[svnIndex - 1]
            lastSVNText = svnDict.get(lastSVNKey, '')
            joined = lastSVNText + svnText
            if difflib.SequenceMatcher(None, joined, ccText).ratio() > _SPLIT_RATIO:
                sys.stdout.write(
                    'SEPARATING LINE %s AND %s - FIXING*\n'
                    % (str(key), str(svnKey)))
                # Returning here stops the scan; the original kept scanning
                # and could emit further output for the same CC key.
                return [makeFixedOutput(lastSVNKey, lastSVNText),
                        makeFixedOutput(svnKey, svnText)]
    return None


def checkFiles2(svnDict, ccDict, sortedSVNDictKeys, sortedccDictKeys):
    """Build the annotated output text for one SVN/CC file pair.

    CC keys whose decimal form does not end in ``0`` produce no output.  For
    every other CC key the first matching rule decides what is emitted:

    1. SVN has the same text under the same key: ``makeOutput``.
    2. SVN text under the same key has ratio > 0.90: ``makeSimilarOutput``.
    3. A nearby SVN line matches (see ``_alignInWindow``).
    4. Otherwise: ``makeNotFixedOutput``.

    Args:
        svnDict: key -> text for the SVN file.
        ccDict: key -> text for the CC file.
        sortedSVNDictKeys: keys of *svnDict* in ascending order.
        sortedccDictKeys: keys of *ccDict* in ascending order.

    Returns:
        The concatenated output; ``''`` when *sortedccDictKeys* is empty (the
        original raised ``IndexError`` in that case).
    """
    chunks = []
    for ccIndex, key in enumerate(sortedccDictKeys):
        if str(key)[-1] != '0':
            continue

        ccText = ccDict.get(key)
        svnText = svnDict.get(key)
        if svnText == ccText:
            chunks.append(makeOutput(key, ccText))
            continue

        if svnText and ccText:
            ratio = difflib.SequenceMatcher(None, svnText, ccText).ratio()
            if ratio > _SIMILAR_RATIO:
                sys.stdout.write(
                    "KEY %s IN SVN AND KEY %s IN CC IS SIMILAR (%s) - NOT FIXING \n"
                    % (str(key), str(key), ratio))
                chunks.append(makeSimilarOutput(key, ccText))
                continue

        aligned = _alignInWindow(ccIndex, key, ccText, svnDict,
                                 sortedSVNDictKeys)
        if aligned is None:
            sys.stdout.write(
                "CANNOT ALIGN KEY %s IN CC-PIPELINE - NOT FIXING\n"
                % (str(key)))
            aligned = [makeNotFixedOutput(key, ccText)]
        chunks.extend(aligned)
    return ''.join(chunks)


def testDicts(svnDict, ccDict):
    """Walk *svnDict* comparing each entry with *ccDict*; currently a no-op.

    Both outcomes of the comparison do nothing, so the function has no
    observable effect and returns ``None``.
    """
    for line in svnDict:
        if svnDict.get(line) == ccDict.get(line):
            pass
        else:
            pass


def main():
    """Parse the command line and write one aligned file per matching pair.

    Options ``-s``, ``-c`` and ``-o`` name the SVN, CC and output folders.
    Each is concatenated directly with a file name, so it must end with a
    path separator.  Only files whose names agree at the same position of
    both folder listings are processed.
    """
    parser = OptionParser()
    parser.add_option("-s", "--SVN-Folder", dest="svn",
                      help="SVN Corpus Folder", metavar="SVN Corpus Folder")
    parser.add_option("-c", "--CC-Folder", dest="cc",
                      help="Corpus Clean Folder",
                      metavar="Corpus Clean Folder")
    parser.add_option("-o", "--Output-Folder", dest="out",
                      help="Output Folder", metavar="Output Folder")
    options, _ = parser.parse_args()

    # Report missing folders clearly instead of failing later with a
    # TypeError when None is concatenated with a file name.
    missing = [flag for flag, value in (("-s", options.svn),
                                        ("-c", options.cc),
                                        ("-o", options.out))
               if value is None]
    if missing:
        parser.error("missing required option(s): %s" % ", ".join(missing))

    cc_folder = options.cc
    svn_folder = options.svn
    output_folder = options.out

    wikiReader = WikiReader()
    svnFiles = wikiReader.listFiles(svn_folder)
    ccFiles = wikiReader.listFiles(cc_folder)

    # zip stops at the shorter listing; indexing ccFiles by the SVN index
    # raised IndexError when the CC folder held fewer files.
    for svnFile, ccFile in zip(svnFiles, ccFiles):
        if svnFile != ccFile:
            continue
        svnDict, ccDict, sortedSVNDictKeys, sortedccDictKeys = dictFiles(
            svnFile, ccFile, svn_folder, cc_folder)
        aligned = checkFiles2(svnDict, ccDict, sortedSVNDictKeys,
                              sortedccDictKeys)
        with open(output_folder + svnFile, 'w') as outFile:
            outFile.write(aligned)


if __name__ == '__main__':
    main()