# -*- coding: utf-8 -*-
"""Text helpers: control-character escaping, whitespace tests, extraction of
an HTML element from markup, and UTF-8 file access."""

import codecs
import os
import re
import string
import sys
import unicodedata

# The module was written against the Python 2 builtins; fall back to their
# Python 3 equivalents so that it can be imported on either interpreter.
try:
    _unichr = unichr
except NameError:
    _unichr = chr

try:
    _xrange = xrange
except NameError:
    _xrange = range

# Format templates for GML tags; {0} is replaced with the tag name.
GML_TAG_EMPTY = u'⌊{0}⌋'
GML_TAG_OPEN = u'⌊{0}'
GML_TAG_CLOSE = u'{0}⌋'

ESCAPE = u'\\'
ESCAPED_ESCAPE = ESCAPE * 2

# The 32 control characters U+0000 through U+001F.
CONTROL_CHARS = [_unichr(c) for c in range(0x20)]

# Every code point whose Unicode category is a separator (Zs, Zl or Zp),
# plus the whitespace characters listed by the ``string`` module.
WHITESPACE = set(
    char
    for char in (_unichr(i) for i in _xrange(sys.maxunicode + 1))
    if unicodedata.category(char).startswith('Z')
)
WHITESPACE.update(string.whitespace)


def escape(string):
    """Return ``string`` with each control character in CONTROL_CHARS
    replaced by its ``unicode_escape`` spelling.

    All other characters, including the backslash itself, are copied
    unchanged.
    """
    chars = []
    for char in string:
        if char in CONTROL_CHARS:
            # encode() yields bytes; decode them again so that only text is
            # joined below, whichever interpreter is running.
            chars.append(char.encode('unicode_escape').decode('ascii'))
        else:
            chars.append(char)
    return u''.join(chars)


# Reverse lookup for unescape(): escaped spelling -> control character.
_ESCAPED_CONTROL_CHARS = dict((escape(char), char) for char in CONTROL_CHARS)
_ESCAPED_CONTROL_PATTERN = re.compile(
    u'|'.join(re.escape(seq) for seq in sorted(_ESCAPED_CONTROL_CHARS)))


def extract_element(html, pattern):
    """Return the first element of ``html`` located by ``pattern``.

    ``pattern`` is a compiled regular expression whose first match marks the
    ``<`` that opens the element.  The tag name runs from there to the next
    space or ``>``.  The result spans from the opening tag through the
    closing tag that balances it, counting nested elements whose tag starts
    with the same name.

    If the tag name is never terminated, or the element is never closed, the
    remainder of ``html`` starting at the match is returned.

    Raises:
        ValueError: if ``pattern`` does not match anywhere in ``html``.
    """
    match = pattern.search(html)
    if match is None:
        raise ValueError('pattern does not match the given HTML')

    start = match.start()
    length = len(html)

    # Locate the end of the tag name without reading past the input.
    name_end = start + 1
    while name_end < length and html[name_end] not in u' >':
        name_end += 1
    if name_end >= length:
        return html[start:]

    tag_name = html[start + 1:name_end]
    # NOTE: open_tag is a prefix test, so '<div' also counts '<divider'.
    open_tag = u'<{0}'.format(tag_name)
    close_tag = u'</{0}>'.format(tag_name)

    closures_needed = 1
    end = name_end
    while end < length:
        if substring_match(html, close_tag, end):
            end += len(close_tag)
            closures_needed -= 1
            if closures_needed == 0:
                break
        elif substring_match(html, open_tag, end):
            closures_needed += 1
            end += len(open_tag)
        else:
            # Advance only when no tag was consumed, so that a tag that
            # immediately follows another one is still examined.
            end += 1
    return html[start:end]


def is_escaped(string, i):
    """Return True if the character at index ``i`` is preceded by an odd
    number of consecutive ESCAPE characters.

    An even-length run consists solely of escaped escapes, which leaves the
    character at ``i`` unescaped.
    """
    backslashes = 0
    position = i - 1
    while position >= 0 and string[position] == ESCAPE:
        backslashes += 1
        position -= 1
    return backslashes % 2 == 1


def is_whitespace(string):
    """Return True if every character of ``string`` is in WHITESPACE.

    An empty string yields True.
    """
    return all(char in WHITESPACE for char in string)


def open(path, mode='r', encoding='utf-8'):
    """Open ``path`` through ``codecs.open`` with the given mode and
    encoding (UTF-8 by default) and return the stream."""
    return codecs.open(path, mode, encoding)


def read(path):
    """Return the entire decoded contents of the file at ``path``."""
    # The with-block closes the stream even when read() raises.
    with open(path) as istream:
        return istream.read()


def substring_match(string, sub, i):
    """Return True if ``sub`` occurs in ``string`` starting at index ``i``."""
    return string[i:i + len(sub)] == sub


def unescape(string):
    """Reverse escape(): turn each escaped control-character spelling back
    into the control character it stands for.

    Only the spellings that escape() emits for CONTROL_CHARS are recognised;
    any other text is copied unchanged.
    """
    pieces = []
    position = 0
    for match in _ESCAPED_CONTROL_PATTERN.finditer(string):
        pieces.append(string[position:match.start()])
        pieces.append(_ESCAPED_CONTROL_CHARS[match.group(0)])
        position = match.end()
    pieces.append(string[position:])
    return u''.join(pieces)