# -*- coding: utf-8 -*-
import re

# Compiled regular expressions for stripping / rewriting wiki markup
# (tables, templates, references, trailing sections).
#
# NOTE(review): this section was found collapsed onto a handful of physical
# lines.  That left `import re` and the first assignment inside the coding
# comment and split string literals across lines, so the module could not be
# parsed.  It is restored here to one statement per line; pattern text is kept
# exactly as found unless a comment says otherwise.
#
# NOTE(review): several patterns below are empty or consist only of empty
# groups (regneosfirst, regneosfinal, regneosremove, regeosmathnowiki, regbr,
# regcomment, regsource, reggallery, regcenter, ...).  An empty pattern matches
# at every position.  Presumably the markup literals these were meant to match
# were lost before this review -- TODO confirm against the canonical copy
# before relying on any of them.

# NOTE(review): the two "\n" below stand for literal line breaks that were
# found inside the pattern; presumably they replace lost delimiters -- confirm.
regtitlepreprocess = re.compile(r'\n([^<]*)\n', re.MULTILINE | re.DOTALL)
regdeflist = re.compile(r'(\n;[^:\n]*:)')
regsentinitialbracket = re.compile(r'([^M][^rs]\.)\s+(\[\[([^|\]]+\|)?([A-Z])([^|\]]+)\]\])', re.MULTILINE | re.DOTALL)
regneosfirst = re.compile(r'()+\s*()+\s*()*\s?', re.MULTILINE | re.DOTALL)
regneosfinal = re.compile(r'()*\s*()+\s*()+\s?', re.MULTILINE | re.DOTALL)
regneosremove = re.compile(r'', re.MULTILINE | re.DOTALL)
regeos = re.compile(r'\s?', re.MULTILINE | re.DOTALL)
# NOTE(review): "{3-10}" is not a repetition operator in `re` (the syntax is
# {m,n}), so it is matched as the literal characters "{3-10}".  Presumably
# {3,10} was intended; left unchanged because, as the pattern stands, both
# alternatives can match the empty string.
regeosTwoOrMore = re.compile(r'((\s*)|(\s?\n)){3-10}', re.MULTILINE | re.DOTALL)
regredirect = re.compile(r'#[rR][eE][dD][iI][rR][eE][cC][tT]\s?\[\[([^\]]+)\]\]', re.MULTILINE | re.DOTALL)
regcurly1 = re.compile(r'(\({{[^}^{]*?}}\)|\'*{{[^}^{]*?}}\'*)', re.MULTILINE | re.DOTALL)
reglongTemp = re.compile(r'{{[^}^{]{500,10000000}}}\)|\'*{{[^}^{]{500,10000000}}}', re.MULTILINE | re.DOTALL)
regboxtable = re.compile(r'{{[^}^{]*?(box|table)[^}^{]*}}', re.MULTILINE | re.DOTALL)
regwikitable1 = re.compile(r'{[^}]*?class="?wikitable"?[^}]*?}', re.MULTILINE | re.DOTALL)
regwikitable2 = re.compile(r'[^{]{\|[^\\][^}^{]*?}', re.MULTILINE | re.DOTALL)
regwikitable3 = re.compile(r'([^{]|^){\|[^\\].{0,10000}?\|}', re.MULTILINE | re.DOTALL)
regtable = re.compile(r'<\s?table[^/]{0,100}>.{0,30000}', re.MULTILINE | re.DOTALL)
regtableborder = re.compile(r'<\s?TABLE[^/]{0,100}>.{0,30000}', re.MULTILINE | re.DOTALL)
#regtableConvertStart = re.compile(r'{\|', re.MULTILINE | re.DOTALL)
#regtableConvertEnd = re.compile(r'\|}', re.MULTILINE | re.DOTALL)
#regtableStephan = re.compile(r'\{\|(?:(?!\{\|)(?!\|\}).)+?\|\}')

# The next three definitions were disabled by wrapping them in a bare
# triple-quoted (non-raw) string; they are kept as comments so their regex
# backslashes are no longer parsed as string escapes.
#regtableStephanImproved = re.compile(r'{\|(?:(?!{\|)(?!\|}).)+?\|}', re.MULTILINE | re.DOTALL)
#regtableStephanImproved2 = re.compile(r'^[:*]{0,4}{\|(?:(?!^[:*]{0,4}\|}).)+^[:*]{0,4}\|}', re.MULTILINE | re.DOTALL)
#regtableStephanImproved3 = re.compile(r'^[:* ]{0,4}{\|(?:(?!^[:* ]{0,4}(?:{\||\|})).)+?^[:* ]{0,4}\|}', re.MULTILINE | re.DOTALL)

# NOTE(review): a physical line break was found inside the look-ahead's
# character class; it is treated as a wrapping artefact because every other
# occurrence of that class in this pattern is "[:* ]" -- confirm.
regtableClean = re.compile(r'(^[:* ]{0,4}|^(<[^>]*?>)\s*(<[^>]*?>)*\s*){\|(?:(?!^[:* ]{0,4}(?:{\||\|})).)+?^[:* ]{0,4}\|}', re.MULTILINE | re.DOTALL)
regtablestart = re.compile(r'(^[:* ]{0,4}|^(<[^>]*?>)\s*(<[^>]*?>)*\s*){\|')
#regtableGisleImproved = re.compile(r'^[:* ]{0,4}{\|(?:(?!^[:* ]{0,4}\|}).)(?:(?!^[:* ]{0,4}{\|).)+?^[:* ]{0,4}\|}', re.MULTILINE | re.DOTALL)
regmultiplePipes = re.compile(r'(\|.[^\|]*){20,100000}')
#regtableStephan = re.compile(r'{\|(?:(?!{\|).)+\|}') #DEPRECATED

# Table start/end markers are swapped for the placeholder characters below and
# later swapped back (see the *Revert* patterns).
regtableConvertStart = re.compile(r'(^.)?{\|(?=([^\\].{0,400000}?\|}))', re.MULTILINE | re.DOTALL)
regtableConvertEnd = re.compile(r'(Ӂ.{0,400000}?)(\|})', re.MULTILINE | re.DOTALL)
regtableConvertRevertStart = re.compile(r'Ӂ', re.MULTILINE | re.DOTALL)
regtableConvertRevertEnd = re.compile(r'ጣ', re.MULTILINE | re.DOTALL)
regtableConverted = re.compile(r'Ӂ[^Ӂ]{0,400000}?ጣ', re.MULTILINE | re.DOTALL)
# Raw literal: the original was a plain string whose "\|", "\s" and "\(" are
# invalid string escapes.  The resulting pattern text is unchanged.
regtablehardcode = re.compile(r'{\|\sclass="wikitable"\s!English\s\|\|\sFrench\s\|\|\sIPA pronunciation\s\(Canadian accent\).*?\|}', re.MULTILINE | re.DOTALL)
# Rebinds regeos; as found, the pattern is identical to the definition above.
regeos = re.compile(r'\s?', re.MULTILINE | re.DOTALL)
regeosmathnowiki = re.compile(r'', re.MULTILINE | re.DOTALL)
regbr = re.compile(r'', re.MULTILINE | re.DOTALL)
regref = re.compile(r'<[Rr]ef.*?/[Rr]ef>', re.MULTILINE | re.DOTALL)
regref2 = re.compile(r'<[Rr]ef[^>]*?/>', re.MULTILINE | re.DOTALL)
regcomment = re.compile(r'', re.MULTILINE | re.DOTALL)
regblockquote = re.compile(r'(.*?)', re.MULTILINE | re.DOTALL)
regsource = re.compile(r'', re.MULTILINE | re.DOTALL)
regdiv2 = re.compile(r']*?>', re.MULTILINE | re.DOTALL)
reggallery = re.compile(r'', re.MULTILINE | re.DOTALL)
regsingleast = re.compile(r'^\*$')
reghyphen = re.compile(r'----?', re.MULTILINE | re.DOTALL)
regcenter = re.compile(r'', re.MULTILINE | re.DOTALL)

# Trailing sections: each pattern runs from the heading to the end of the text
# (".*" under DOTALL).
regsealso = re.compile(r'==+\s?See also\s?==.*', re.MULTILINE | re.DOTALL)
regnotes = re.compile(r'==+\s?Notes\s?.*', re.MULTILINE | re.DOTALL)
regreferences = re.compile(r'==+\s?References.*', re.MULTILINE | re.DOTALL)
regsources = re.compile(r'==+\s?Sources.*', re.MULTILINE | re.DOTALL)
# NOTE(review): this section was found collapsed onto a few physical lines,
# with statements and comments running into each other.  It is restored to one
# statement per line; pattern text is kept exactly as found.  Where a comment
# ended and live code began is inferred from the fact that every disabled
# assignment carries its own leading "#" -- confirm against the canonical copy.
#
# NOTE(review): several patterns here are empty or contain empty groups
# (regspan, regremovenowiki, regremoveeosinsource2, regremoveeosincode2, the
# middle group of regremoveeosinmath / regremoveeosinnowiki).  Presumably the
# markup literals they were meant to match were lost before this review --
# TODO confirm before relying on them.

# A heading whose name appears again in a later heading.
regsourcelookahead = re.compile(r'==+\s?(?=Sources.*?==+\s?Sources)', re.MULTILINE | re.DOTALL)
regseealsolookahead = re.compile(r'==+\s?(?=See also.*?==+\s?See also)', re.MULTILINE | re.DOTALL)
regnoteslookahead = re.compile(r'==+\s?(?=Notes.*?==+\s?Notes)', re.MULTILINE | re.DOTALL)
regreflookahead = re.compile(r'==+\s?(?=References.*?==+\s?References)', re.MULTILINE | re.DOTALL)
regsourcelookaheadrestore = re.compile(r'___(Sources)(.*?==+)', re.MULTILINE | re.DOTALL)
regnoteslookaheadrestore = re.compile(r'___(Notes)(.*?==+)', re.MULTILINE | re.DOTALL)
regreflookaheadrestore = re.compile(r'___(References)(.*?==+)', re.MULTILINE | re.DOTALL)
regseealsolookaheadrestore = re.compile(r'___(See also)(.*?==+)', re.MULTILINE | re.DOTALL)

# Trailing sections: heading through end of text (".*" under DOTALL).
regbibliography = re.compile(r'==+\s?Bibliography\s?.*', re.MULTILINE | re.DOTALL)
regfootnotes = re.compile(r'==+\s?Footnotes\s?.*', re.MULTILINE | re.DOTALL)
regrelated = re.compile(r'==+\s?Related web sites\s?.*', re.MULTILINE | re.DOTALL)
regexternal = re.compile(r'==+\s?External links\s?.*', re.MULTILINE | re.DOTALL)

regmath = re.compile(r'[^\>]+?', re.MULTILINE | re.DOTALL)
regremovenewline = re.compile(r'(?=\n)\n', re.MULTILINE | re.DOTALL)
regparagraph = re.compile(r'(\n\n)', re.MULTILINE | re.DOTALL)
regtitle = re.compile(r'(==+?\s?[^=]+?\s?==+\s?)', re.MULTILINE | re.DOTALL)
regbullets = re.compile(r'(^[#*].*?)\n', re.MULTILINE | re.DOTALL)
regbulletscolon = re.compile(r'(?<=([\.:]\s))([*#].*?)\n')
regbullets2 = re.compile(r'(^\*.*?)\n', re.MULTILINE | re.DOTALL)
regindentcolon = re.compile(r'(^[;:][:;]?.*?)\n', re.MULTILINE | re.DOTALL)
regcolon = re.compile(r'^:', re.MULTILINE | re.DOTALL)
regimage = re.compile(r'\[\[Image:[^\[\]]*(\[\[[^\]]*\]\][^\[\]]*)*?\]\]', re.MULTILINE | re.DOTALL)
regspan = re.compile(r'', re.MULTILINE | re.DOTALL)
regletternumber = re.compile(r'[A-Za-z0-9]', re.MULTILINE | re.DOTALL)
regonlyXML = re.compile(r'^(<[^>]*?>)\s*(<[^>]*?>)*\s*$', re.MULTILINE | re.DOTALL)
regbracket = re.compile(r'(\[\[([^|\]]+\|)?([^|\]]+)\]\]s\.)', re.MULTILINE | re.DOTALL)
regjava = re.compile(r'{{(Javadoc:.*?)}}', re.MULTILINE | re.DOTALL)
regiast = re.compile(r'{{(IAST.*?)}}', re.MULTILINE | re.DOTALL)
regipa = re.compile(r'{{(IPA.*?)}}(?=[^}])', re.MULTILINE | re.DOTALL)
regyeareos = re.compile(r'([0-9]{2,4}s\.)', re.MULTILINE | re.DOTALL)
regorg = re.compile(r'(\.org\.\s)', re.MULTILINE | re.DOTALL)

##japanese
# The definitions down to "#preservetemplate" were disabled by wrapping them in
# a bare triple-quoted (non-raw) string; they are kept as comments so their
# regex backslashes are no longer parsed as string escapes.
#regtransjap = re.compile(r'{{transl\|ja\|([^}]*?)}}', re.MULTILINE | re.DOTALL)
#reglangjap = re.compile(r'{{lang\|ja\|([^}]*?)}}', re.MULTILINE | re.DOTALL)
#regnihongohardcode = re.compile(r'({{Nihongo\|[^|]+\|[^|]+\|)3=',re.MULTILINE | re.DOTALL)
#regnihongojap5 = re.compile(r'{{[Nn]ihongo\|([^|}]+)\|([^|}]+)\|([^|}]+)\|([^|}]+)\|([^|}]+)}}', re.MULTILINE | re.DOTALL)
#regnihongojap4 = re.compile(r'{{[Nn]ihongo\|([^|}]+)\|([^|}]+)\|([^|}]+)\|([^|}]+)}}', re.MULTILINE | re.DOTALL)
#regnihongojap3 = re.compile(r'{{[Nn]ihongo\|([^|}]*?)\|([^|}]*?)\|([^|}]+)}}', re.MULTILINE | re.DOTALL)
#regnihongojap2 = re.compile(r'{{[Nn]ihongo\|([^|}]+)\|([^|}]+)\|?}}', re.MULTILINE | re.DOTALL)
#general
#reglanggeneral = re.compile(r'{{lang\|[^|]+\|([^}]*?)}}', re.MULTILINE | re.DOTALL)
#regtransgeneral = re.compile(r'{{transl\|[^|]+\|([^}]*?)}}', re.MULTILINE | re.DOTALL)

#preservetemplate
reglanggeneralpreserve = re.compile(r'{{(lang\|.*?)}}', re.MULTILINE | re.DOTALL)
regtransgeneralpreserve = re.compile(r'{{(transl\|.*?)}}', re.MULTILINE | re.DOTALL)
regnihongopreservere = re.compile(r'{{([Nn]ihongo\|.*?)}}', re.MULTILINE | re.DOTALL)

#harvard
regharv_general = re.compile(r'{{([Hh]arv.*?)}}', re.MULTILINE | re.DOTALL)
regaudio_general = re.compile(r'{{([Aa]udio.*?)}}', re.MULTILINE | re.DOTALL)
regflagtemplate = re.compile(r'{{([Ff]lag\s?\|.*?)}}', re.MULTILINE | re.DOTALL)

# USE IF TEMPLATE EXPANSION
# (also disabled via a bare triple-quoted string in the original; kept as
# comments for the same reason as the block above)
#regharvtxt_aut_year_page = re.compile(r'{{[Hh]arvtxt\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvtxt_aut_year = re.compile(r'{{[Hh]arvtxt\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharv_aut_year_page = re.compile(r'{{[Hh]arv\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvtxt_aut_aut_year_page = re.compile(r'{{[Hh]arvtxt\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharv_aut_aut_year_page = re.compile(r'{{[Hh]arv\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvtxt_aut_aut_year = re.compile(r'{{[Hh]arvtxt\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharv_aut_aut_year = re.compile(r'{{[Hh]arv\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_year_page_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_aut_year_page_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_aut_year_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_year_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_aut_aut_year_page_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)
#regharvnb_aut_aut_aut_year_nb = re.compile(r'{{[Hh]arvnb\s?\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([^\|]+)\|\s?([0-9][^\|]+)}}', re.MULTILINE | re.DOTALL)
#regharvcoltxt_aut_year_page = re.compile(r'{{[Hh]arvcoltxt\s?\|\s?([^\|]+)\|\s?([0-9][^\|]+)\|\s?((pp?|loc)=[^\}]+)}}', re.MULTILINE | re.DOTALL)

# Turn the "<___ ... ___>" placeholders back into template braces.
regbacktocurly1 = re.compile(r'<___', re.MULTILINE | re.DOTALL)
regbacktocurly2 = re.compile(r'___>', re.MULTILINE | re.DOTALL)

#original: could become exponential
#regremoveeosinmath = re.compile(r'((.*)*?(.*?)', re.MULTILINE | re.DOTALL)
#regremoveeosinmath = re.compile(r'(.{0,500})(.{0,500}){0,30}}(.{0,500})', re.MULTILINE | re.DOTALL)
#TRY AGAIN:
#make greedy!
regremoveeosinmath = re.compile(r'([^<]{0,500}?)()(.{0,500}?)', re.MULTILINE | re.DOTALL)
# NOTE(review): DISABLED.  As found, this pattern has three "(" and two ")",
# so re.compile() raises re.error ("missing ), unterminated subpattern") and
# the module cannot be imported.  The missing text cannot be determined from
# this file; restore the pattern from the canonical copy before re-enabling.
#regremoveeosinsource = re.compile(r'((.*)*?(.*?)', re.MULTILINE | re.DOTALL)
#regremoveeosinsource2 = re.compile(r'(.{0,5000}?)', re.MULTILINE | re.DOTALL)
#regremoveeosincode2 = re.compile(r'(.{0,5000}?)', re.MULTILINE | re.DOTALL)
#TRYING THE OLD
regremoveeosinsource2 = re.compile(r'()', re.MULTILINE | re.DOTALL)
regremoveeosincode2 = re.compile(r'()', re.MULTILINE | re.DOTALL)
# NOTE(review): DISABLED for the same reason as regremoveeosinsource above
# (unbalanced parentheses as found; raises re.error at import).
#regremoveeosincode = re.compile(r'((.*)*?(.*?)', re.MULTILINE | re.DOTALL)
regremoveeosinmath2 = re.compile(r'(.{0,5000}?)', re.MULTILINE | re.DOTALL)
regremoveeosinnowiki2 = re.compile(r'(.{0,5000}?)', re.MULTILINE | re.DOTALL)
#ORIGINAL:
#regremoveeosinnowiki = re.compile(r'((.*)*?(.*?)', re.MULTILINE | re.DOTALL)
#not multiline/dotall
#regremoveeosinnowiki = re.compile(r'((.*)*?(.*?)')
#not multiline/dotall - not tag
#regremoveeosinnowiki = re.compile(r'(.*)(.*)*?(.*?)')
regremovenowiki = re.compile(r'')
#length constraint
#regremoveeosinnowiki = re.compile(r'((.{0,500})*?(.{0,500})', re.MULTILINE | re.DOTALL)
#length constraint - no multiline
#regremoveeosinnowiki = re.compile(r'((.{0,500})*?(.{0,500})')
#regremoveeosinnowiki = re.compile(r'(.{0,500})(.{0,500}){0,30}(.{0,500})')
#multiple: - greedy
regremoveeosinnowiki = re.compile(r'([^<]{0,500}?)()(.{0,500}?)', re.MULTILINE | re.DOTALL)
#regdigitandbold = re.compile(r'(^:"[0-9][0-9]\.\s+(?=\'))', re.MULTILINE | re.DOTALL)
#regcurly1 = re.compile(r'{{?[^}^{]*?}?}', re.MULTILINE | re.DOTALL)
#regsource1 = re.compile(r'