;;;-*- Mode: Lisp; Package: ENCODING -*-
;;
;; Copyright (C) Paul Meurer 2001-2004. All rights reserved.
;; paul.meurer@hit.uib.no
;; HIT-centre, University of Bergen
;;
;; utf-8 conversion
;;

(in-package :encoding)

;; Maps a native character code to the string of byte-valued characters that
;; WRITE-UTF-8-ENCODED emits for it.  Filled by the LOOP forms at the end of
;; this file.
(defvar *utf-8-table* (make-hash-table))

(defun write-utf-8-encoded (string stream &key (start 0) (end (length (or string ""))))
  "Write the characters of STRING between START and END to STREAM as UTF-8.

Characters whose code is below #x80 are copied unchanged.  Every other
character is replaced by its entry in *UTF-8-TABLE* when there is one, and
otherwise by the bytes WRITE-UNICODE-TO-UTF-8 produces for its character code.
Each output byte is written as one character.  END may be NIL, meaning the end
of STRING.

Returns the number of characters (= bytes) written.  When STRING is NIL
nothing is written and NIL is returned."
  (declare (optimize (speed 3)) (fixnum start))
  (when string
    (let ((end (or end (length string)))
          (run-start start)
          (count 0))
      (declare (fixnum end run-start count))
      ;; RUN-START is the beginning of the pending run of plain characters
      ;; that have been scanned but not yet written.
      (loop for pos fixnum from start below end
            do (let ((code (char-code (char string pos))))
                 ;; Everything from #x80 upwards needs encoding.  The previous
                 ;; test (> code 128) let code 128 through un-encoded.
                 (when (>= code #x80)
                   (write-string string stream :start run-start :end pos)
                   (incf count (- pos run-start))
                   (let ((utf-8 (gethash code *utf-8-table*)))
                     (if utf-8
                         (progn
                           (write-string utf-8 stream)
                           (incf count (length utf-8)))
                         (incf count (write-unicode-to-utf-8 code stream))))
                   (setf run-start (1+ pos)))))
      ;; Flush whatever plain characters follow the last encoded one.
      (write-string string stream :start run-start :end end)
      (incf count (- end run-start))
      count)))

(defun utf-8-encode (string)
  "Return STRING encoded as UTF-8, one character per byte.

The second value is the number of bytes written.  A NIL STRING is treated as
the empty string."
  (let ((length nil))
    (values (with-output-to-string #-lispworks (stream)
                                   #+lispworks (stream nil :element-type 'lispworks::simple-char)
              (setf length (write-utf-8-encoded (or string "") stream)))
            length)))

(defun entity-to-utf-8 (entity)
  "Return the UTF-8 bytes, one character per byte, of a numeric character reference.

ENTITY is a string such as \"&#xE1;\" (hexadecimal) or \"&#225;\" (decimal).
Whatever follows the digits is ignored.  Signals an error when ENTITY does not
start with \"&#\" followed by a non-negative number."
  (let* ((long-enough-p (> (length entity) 2))
         (hex-p (and long-enough-p
                     (char-equal (char entity 2) #\x)))
         (code (and long-enough-p
                    (char= (char entity 0) #\&)
                    (char= (char entity 1) #\#)
                    ;; Digits start after "&#x" (hex) or after "&#" (decimal).
                    (parse-integer entity
                                   :start (if hex-p 3 2)
                                   :radix (if hex-p 16 10)
                                   :junk-allowed t))))
    (unless (and code (not (minusp code)))
      (error "entity-to-utf-8(): ~s is not a numeric character reference." entity))
    (with-output-to-string (stream)
      (write-unicode-to-utf-8 code stream))))

(defun utf-8-decode (string &optional resolve-entities-p external-entities)
  "Decode STRING, which holds UTF-8 data as one character per byte.

Returns a new string with one character per decoded code point, or NIL when
STRING is NIL.

When RESOLVE-ENTITIES-P is true, every \"&name;\" is passed (without the
delimiters) to ENTITY-TO-CHAR.  If that yields a character it is written;
otherwise the string following the name in the list EXTERNAL-ENTITIES is
written; if neither resolves, the entity is copied through unchanged.

Signals an error if STRING is not well-formed UTF-8 (truncated sequence,
missing continuation byte, impossible lead byte), if a decoded code point
cannot be represented as a character, or if an entity has no closing ';'."
  #+debug (print string)
  (when string
    (let ((length (length string))
          (pos 0))
      (with-output-to-string #-lispworks (stream)
                             #+lispworks (stream nil :element-type 'lispworks::simple-char)
        (labels ((malformed ()
                   (error "The string ~s does not seem to be UTF-8." string))
                 (decode-multibyte (code size)
                   ;; CODE holds the lead byte's payload, already shifted into
                   ;; place.  SIZE is the number of payload bits still to be
                   ;; read, six per continuation byte.
                   (incf pos)           ; step over the lead byte
                   (loop until (zerop size)
                         do (when (>= pos length)
                              (malformed))
                            (let ((byte (char-code (char string pos))))
                              ;; Continuation bytes must look like 10xxxxxx.
                              (unless (= (logand #b11000000 byte) #b10000000)
                                (malformed))
                              (decf size 6)
                              (incf code (ash (logand #b00111111 byte) size)))
                            (incf pos))
                   (let ((char (and (< code char-code-limit)
                                    (code-char code))))
                     (unless char
                       (error "utf-8-decode(): Code point #x~x in ~s cannot be represented as a character."
                              code string))
                     (write-char char stream)))
                 (decode-entity ()
                   ;; POS is at the '&'.  Consume through the closing ';'.
                   (let ((ent-end (position #\; string :start (1+ pos))))
                     (when (null ent-end)
                       (error "utf-8-decode(): Could not find entity end marker ';' in ~s." string))
                     ;; Original author's note: hex entities are not handled here.
                     (let* ((ent (subseq string (1+ pos) ent-end))
                            (char (entity-to-char ent)))
                       (setf pos (1+ ent-end))
                       (cond (char
                              (write-char char stream))
                             (t
                              (let ((ent-res (cadr (member ent external-entities
                                                           :test #'string=))))
                                (cond (ent-res
                                       (write-string ent-res stream))
                                      (t
                                       ;; Unresolvable: write the entity back as it was.
                                       (write-char #\& stream)
                                       (write-string ent stream)
                                       (write-char #\; stream))))))))))
          (loop while (< pos length)
                do (let* ((char (char string pos))
                          (code (char-code char)))
                     (cond ((and resolve-entities-p (char= char #\&))
                            (decode-entity))
                           ((zerop (logand #b10000000 code))
                            (incf pos)
                            (write-char char stream))
                           ((= (logand #b11100000 code) #b11000000)
                            (decode-multibyte (ash (logand #b00011111 code) 6) 6))
                           ((= (logand #b11110000 code) #b11100000)
                            (decode-multibyte (ash (logand #b00001111 code) 12) 12))
                           ((= (logand #b11111000 code) #b11110000)
                            (decode-multibyte (ash (logand #b00000111 code) 18) 18))
                           ((= (logand #b11111100 code) #b11111000)
                            (decode-multibyte (ash (logand #b00000011 code) 24) 24))
                           ((= (logand #b11111110 code) #b11111100)
                            (decode-multibyte (ash (logand #b00000001 code) 30) 30))
                           (t
                            ;; No lead-byte pattern matches.  Previously POS was
                            ;; left unchanged here and decoding never finished.
                            (malformed))))))))))

;; Kept after the definition so that the call can run when the TEST feature
;; is enabled.
#+test
(print (utf-8-decode "&sted;" t))

(defun write-unicode-to-utf-8 (code stream)
  "Write the UTF-8 encoding of the integer CODE to STREAM, one character per byte.

Sequences of up to six bytes are produced, covering codes below #x80000000.
Returns the number of bytes written.  For a larger CODE nothing is written
and NIL is returned."
  (flet ((emit (byte)
           (write-char (code-char byte) stream)))
    (multiple-value-bind (length lead-marker)
        (cond ((< code #x80)       (values 1 #b00000000))
              ((< code #x800)      (values 2 #b11000000))
              ((< code #x10000)    (values 3 #b11100000))
              ((< code #x200000)   (values 4 #b11110000))
              ((< code #x4000000)  (values 5 #b11111000))
              ((< code #x80000000) (values 6 #b11111100)))
      (when length
        ;; Lead byte: length marker plus the most significant payload bits.
        (emit (logior lead-marker (ash code (* -6 (1- length)))))
        ;; Continuation bytes, most significant six-bit group first.  For a
        ;; one-byte sequence the range is empty.
        (loop for shift from (* -6 (- length 2)) to 0 by 6
              do (emit (logior #b10000000 (logand #b00111111 (ash code shift)))))
        length))))

;; NOTE(review): the two entity tables that start here are reproduced exactly
;; as received.  Their line structure, key characters and "&#x...;" entity
;; strings appear to have been damaged before review (ENTITY-TO-UTF-8 expects
;; numeric character references), so they should be restored from a pristine
;; copy of this file rather than repaired by hand -- TODO confirm.
#-mcl (loop for (char entity) on '( ;; iso lat 1 ;; aacute #\ "á" ;; ;; Aacute #\ "Á" ;; ;; acirc #\ "â" ;; ;; Acirc #\ "Â" ;; ;; agrave #\ "à" ;; ;; Agrave #\ "À" ;; ;; 
aring #\ "å" ;; ;; Aring #\ "Å" ;; ;; atilde #\ "ã" ;; ;; Atilde #\ "Ã" ;; ;; auml #\ "ä" ;; ;; Auml #\ "Ä" ;; ;; aelig #\ "æ" ;; ;; AElig #\ "Æ" ;; ;; ccedil #\ "ç" ;; ;; Ccedil #\ "Ç" ;; ;; eth #\ "ð" ;; ;; ETH #\ "Ð" ;; ;; eacute #\ "é" ;; ;; Eacute #\ "É" ;; ;; ecirc #\ "ê" ;; ;; Ecirc #\ "Ê" ;; ;; egrave #\ "è" ;; ;; Egrave #\ "È" ;; ;; euml #\ "ë" ;; ;; Euml #\ "Ë" ;; ;; iacute #\ "í" ;; ;; Iacute #\ "Í" ;; ;; icirc #\ "î" ;; ;; Icirc #\ "Î" ;; ;; igrave #\ "ì" ;; ;; Igrave #\ "Ì" ;; ;; iuml #\ "ï" ;; ;; Iuml #\ "Ï" ;; ;; ntilde #\ "ñ" ;; ;; Ntilde #\ "Ñ" ;; ;; oacute #\ "ó" ;; ;; Oacute #\ "Ó" ;; ;; ocirc #\ "ô" ;; ;; Ocirc #\ "Ô" ;; ;; ograve #\ "ò" ;; ;; Ograve #\ "Ò" ;; ;; oslash #\ "ø" ;; ;; Oslash #\ "Ø" ;; ;; otilde #\ "õ" ;; ;; Otilde #\ "Õ" ;; ;; ouml #\ "ö" ;; ;; Ouml #\ "Ö" ;; ;; szlig #\ "ß" ;; ;; thorn #\ "þ" ;; ;; THORN #\ "Þ" ;; ;; uacute #\ "ú" ;; ;; Uacute #\ "Ú" ;; ;; ucirc #\ "û" ;; ;; Ucirc #\ "Û" ;; ;; ugrave #\ "ù" ;; ;; Ugrave #\ "Ù" ;; ;; uuml #\ "ü" ;; ;; Uuml #\ "Ü" ;; ;; yacute #\ "ý" ;; ;; Yacute #\ "Ý" ;; ;; yuml #\ "ÿ" ;; ;; ;; iso lat 2 ;; abreve #\x "ă" ;; ;; Abreve #\x "Ă" ;; ;; amacr #\x "ā" ;; ;; Amacr #\x "Ā" ;; ;; aogon #\x "ą" ;; ;; Aogon #\x "Ą" ;; ;; cacute #\x "ć" ;; ;; Cacute #\x "Ć" ;; ;; ccaron #\x "č" ;; ;; Ccaron #\x "Č" ;; ;; ccirc #\x "ĉ" ;; ;; Ccirc #\x "Ĉ" ;; ;; cdot #\x "ċ" ;; ;; Cdot #\x "Ċ" ;; ;; dcaron #\x "ď" ;; ;; Dcaron #\x "Ď" ;; ;; dstrok #\x "đ" ;; ;; Dstrok #\x "Đ" ;; ;; ecaron #\x "ě" ;; ;; Ecaron #\x "Ě" ;; ;; edot #\x "ė" ;; ;; Edot #\x "Ė" ;; ;; emacr #\x "ē" ;; ;; Emacr #\x "Ē" ;; ;; eogon #\x "ę" ;; ;; Eogon #\x "Ę" ;; ;; gacute #\x "ǵ" ;; ;; gbreve #\x "ğ" ;; ;; Gbreve #\x "Ğ" ;; ;; Gcedil #\x "Ģ" ;; ;; gcirc #\x "ĝ" ;; ;; Gcirc #\x "Ĝ" ;; ;; gdot #\x "ġ" ;; ;; Gdot #\x "Ġ" ;; ;; hcirc #\x "ĥ" ;; ;; Hcirc #\x "Ĥ" ;; ;; hstrok #\x "ħ" ;; ;; Hstrok #\x "Ħ" ;; ;; Idot #\x "İ" ;; ;; Imacr #\x "Ī" ;; ;; imacr #\x "ī" ;; ;; ijlig #\x "ij" ;; ;; IJlig #\x "IJ" ;; ;; inodot #\x "ı" ;; ;; iogon #\x 
"į" ;; ;; Iogon #\x "Į" ;; ;; itilde #\x "ĩ" ;; ;; Itilde #\x "Ĩ" ;; ;; jcirc #\x "ĵ" ;; ;; Jcirc #\x "Ĵ" ;; ;; kcedil #\x "ķ" ;; ;; Kcedil #\x "Ķ" ;; ;; kgreen #\x "ĸ" ;; ;; lacute #\x "ĺ" ;; ;; Lacute #\x "Ĺ" ;; ;; lcaron #\x "ľ" ;; ;; Lcaron #\x "Ľ" ;; ;; lcedil #\x "ļ" ;; ;; Lcedil #\x "Ļ" ;; ;; lmidot #\x "ŀ" ;; ;; Lmidot #\x "Ŀ" ;; ;; lstrok #\x "ł" ;; ;; Lstrok #\x "Ł" ;; ;; nacute #\x "ń" ;; ;; Nacute #\x "Ń" ;; ;; eng #\x "ŋ" ;; ;; ENG #\x "Ŋ" ;; ;; napos #\x "ʼn" ;; ;; ncaron #\x "ň" ;; ;; Ncaron #\x "Ň" ;; ;; ncedil #\x "ņ" ;; ;; Ncedil #\x "Ņ" ;; ;; odblac #\x "ő" ;; ;; Odblac #\x "Ő" ;; ;; Omacr #\x "Ō" ;; ;; omacr #\x "ō" ;; ;; oelig #\x "œ" ;; ;; OElig #\x "Œ" ;; ;; racute #\x "ŕ" ;; ;; Racute #\x "Ŕ" ;; ;; rcaron #\x "ř" ;; ;; Rcaron #\x "Ř" ;; ;; rcedil #\x "ŗ" ;; ;; Rcedil #\x "Ŗ" ;; ;; sacute #\x "ś" ;; ;; Sacute #\x "Ś" ;; ;; scaron #\x "š" ;; ;; Scaron #\x "Š" ;; ;; scedil #\x "ş" ;; ;; Scedil #\x "Ş" ;; ;; scirc #\x "ŝ" ;; ;; Scirc #\x "Ŝ" ;; ;; tcaron #\x "ť" ;; ;; Tcaron #\x "Ť" ;; ;; tcedil #\x "ţ" ;; ;; Tcedil #\x "Ţ" ;; ;; tstrok #\x "ŧ" ;; ;; Tstrok #\x "Ŧ" ;; ;; ubreve #\x "ŭ" ;; ;; Ubreve #\x "Ŭ" ;; ;; udblac #\x "ű" ;; ;; Udblac #\x "Ű" ;; ;; umacr #\x "ū" ;; ;; Umacr #\x "Ū" ;; ;; uogon #\x "ų" ;; ;; Uogon #\x "Ų" ;; ;; uring #\x "ů" ;; ;; Uring #\x "Ů" ;; ;; utilde #\x "ũ" ;; ;; Utilde #\x "Ũ" ;; ;; wcirc #\x "ŵ" ;; ;; Wcirc #\x "Ŵ" ;; ;; ycirc #\x "ŷ" ;; ;; Ycirc #\x "Ŷ" ;; ;; Yuml #\x "Ÿ" ;; ;; zacute #\x "ź" ;; ;; Zacute #\x "Ź" ;; ;; zcaron #\x "ž" ;; ;; Zcaron #\x "Ž" ;; ;; zdot #\x "ż" ;; ;; Zdot #\x "Ż" ;; ;; ;; iso-num ;; half #\x "½" ;; ;; frac12 #\x "½" ;; ;; frac14 #\x "¼" ;; ;; frac34 #\x "¾" ;; ;; frac18 #\x "⅛" ;; ;; frac38 #\x "⅜" ;; ;; frac58 #\x "⅝" ;; ;; frac78 #\x "⅞" ;; ;; sup1 #\x "¹" ;; ;; sup2 #\x "²" ;; ;; sup3 #\x "³" ;; ;; plus #\x "+" ;; ;; plusmn #\x "±" ;; ;; lt #\< "&#60;" ;; ;; equals #\= "=" ;; ;; gt #\> ">" ;; ;; divide #\/ "÷" ;; ;; times #\x "×" ;; ;; curren #\x "¤" ;; ;; pound #\x "£" ;; ;; dollar 
#\x "$" ;; ;; cent #\x "¢" ;; ;; yen #\x "¥" ;; ;; num #\x "#" ;; ;; percnt #\x "%" ;; ;; amp #\& "&#38;" ;; ;; ast #\x "*" ;; ;; commat #\x "@" ;; ;; lsqb #\x "[" ;; ;; bsol #\x "\" ;; ;; rsqb #\x "]" ;; ;; lcub #\x "{" ;; ;; horbar #\x "―" ;; ;; verbar #\x "|" ;; ;; rcub #\x "}" ;; ;; micro #\x "µ" ;; ;; ohm #\x "Ω" ;; ;; deg #\x "°" ;; ;; ordm #\x "º" ;; ;; ordf #\x "ª" ;; ;; sect #\x "§" ;; ;; para #\x "¶" ;; ;; middot #\x "·" ;; ;; larr #\x "←" ;; ;; rarr #\x "→" ;; ;; uarr #\x "↑" ;; ;; darr #\x "↓" ;; ;; copy #\x "©" ;; ;; reg #\x "®" ;; ;; trade #\x "™" ;; ;; brvbar #\x "¦" ;; ;; not #\x "¬" ;; ;; sung #\x "♩" ;; ;; excl #\x "!" ;; ;; iexcl #\x "¡" ;; ;; quot #\x """ ;; ;; apos #\x "'" ;; ;; lpar #\x "(" ;; ;; rpar #\x ")" ;; ;; comma #\x "," ;; ;; lowbar #\x "_" ;; ;; hyphen #\x "-" ;; ;; period #\x "." ;; ;; sol #\x "/" ;; ;; colon #\x ":" ;; ;; semi #\x ";" ;; ;; quest #\x "?" ;; ;; iquest #\x "¿" ;; ;; laquo #\x "«" ;; ;; raquo #\x "»" ;; ;; lsquo #\x "'" #\x "‘" ;; ;; rsquo #\x "'" #\x "’" ;; ;; ldquo #\x """ #\x "“" ;; ;; rdquo #\x """ #\x "”" ;; ;; nbsp #\x " " ;; ;; shy #\x "­" ;; ;; ;; iso-pub ;; emsp #\x " " ;; ;; ensp #\x " " ;; ;; emsp13 #\x " " ;; ;; emsp14 #\x " " ;; ;; numsp #\x " " ;; ;; puncsp #\x " " ;; ;; thinsp #\x " " ;; ;; hairsp #\x " " ;; ;; mdash #\x "-#45;" #\x "—" ;; ;; ndash #\x "-#45;" #\x "–" ;; ;; dash #\x "-#45;" #\x "‐" ;; ;; blank #\x "␣" ;; ;; hellip #\x "…" ;; ;; nldr #\x "‥" ;; ;; frac13 #\x "⅓" ;; ;; frac23 #\x "⅔" ;; ;; frac15 #\x "⅕" ;; ;; frac25 #\x "⅖" ;; ;; frac35 #\x "⅗" ;; ;; frac45 #\x "⅘" ;; ;; frac16 #\x "⅙" ;; ;; frac56 #\x "⅚" ;; ;; incare #\x "℅" ;; ;; block #\x "█" ;; ;; uhblk #\x "▀" ;; ;; lhblk #\x "▄" ;; ;; blk14 #\x "░" ;; ;; blk12 #\x "▒" ;; ;; blk34 #\x "▓" ;; ;; marker #\x "▮" ;; ;; cir #\x "○" ;; ;; squ #\x "□" ;; ;; rect #\x "▭" ;; ;; utri #\x "▵" ;; ;; dtri #\x "▿" ;; ;; star #\x "⋆" ;; ;; bull #\x "•" ;; ;; squf #\x "▪" ;; ;; utrif #\x "▴" ;; ;; dtrif #\x "▾" ;; ;; ltrif #\x "◂" ;; ;; rtrif #\x 
"▸" ;; ;; clubs #\x "♣" ;; ;; diams #\x "♦" ;; ;; hearts #\x "♥" ;; ;; spades #\x "♠" ;; ;; malt #\x "✠" ;; ;; dagger #\x "†" ;; ;; Dagger #\x "‡" ;; ;; check #\x "✓" ;; ;; cross #\x "✗" ;; ;; sharp #\x "♯" ;; ;; flat #\x "♭" ;; ;; male #\x "♂" ;; ;; female #\x "♀" ;; ;; phone #\x "☎" ;; ;; telrec #\x "⌕" ;; ;; copysr #\x "℗" ;; ;; caret #\x "⁁" ;; ;; lsquor #\x "‚" ;; ;; ldquor #\x "„" ;; ;; fflig #\x "ff" ;; ;; filig #\x "fi" ;; ;; ;; ;; ffilig #\x "ffi" ;; ;; ffllig #\x "ffl" ;; ;; fllig #\x "fl" ;; ;; mldr #\x "…" ;; ;; rdquor #\x "“" ;; ;; rsquor #\x "‘" ;; ;; vellip #\x "⋮" ;; ;; hybull #\x "⁃" ;; ;; loz #\x "◊" ;; ;; lozf #\x "✦" ;; ;; ltri #\x "◃" ;; ;; rtri #\x "▹" ;; ;; starf #\x "★" ;; ;; natur #\x "♮" ;; ;; rx #\x "℞" ;; ;; sext #\x "✶" ;; ;; target #\x "⌖" ;; ;; dlcrop #\x "⌍" ;; ;; drcrop #\x "⌌" ;; ;; ulcrop #\x "⌏" ;; ;; urcrop #\x "⌎" ;; ) by #'cddr do (setf (gethash (char-code char) *utf-8-table*) (entity-to-utf-8 entity))) #+mcl (loop for (char entity) on '( ;; iso lat 1 ;; aacute #\ "á" ;; ;; Aacute #\ "Á" ;; ;; acirc #\ "â" ;; ;; Acirc #\ "Â" ;; ;; agrave #\ "à" ;; ;; Agrave #\ "À" ;; ;; aring #\ "å" ;; ;; Aring #\ "Å" ;; ;; atilde #\ "ã" ;; ;; Atilde #\ "Ã" ;; ;; auml #\ "ä" ;; ;; Auml #\ "Ä" ;; ;; aelig #\ "æ" ;; ;; AElig #\ "Æ" ;; ;; ccedil #\c "ç" ;; ;; Ccedil #\c "Ç" ;; ;; eth #\ "ð" ;; ;; ETH #\ "Ð" ;; ;; eacute #\ "é" ;; ;; Eacute #\ "É" ;; ;; ecirc #\ "ê" ;; ;; Ecirc #\ "Ê" ;; ;; egrave #\ "è" ;; ;; Egrave #\ "È" ;; ;; euml #\ "ë" ;; ;; Euml #\ "Ë" ;; ;; iacute #\ "í" ;; ;; Iacute #\ "Í" ;; ;; icirc #\ "î" ;; ;; Icirc #\ "Î" ;; ;; igrave #\ "ì" ;; ;; Igrave #\ "Ì" ;; ;; iuml #\ "ï" ;; ;; Iuml #\ "Ï" ;; ;; ntilde #\ "ñ" ;; ;; Ntilde #\ "Ñ" ;; ;; oacute #\ "ó" ;; ;; Oacute #\ "Ó" ;; ;; ocirc #\ "ô" ;; ;; Ocirc #\ "Ô" ;; ;; ograve #\ "ò" ;; ;; Ograve #\ "Ò" ;; ;; oslash #\ "ø" ;; ;; Oslash #\ "Ø" ;; ;; otilde #\ "õ" ;; ;; Otilde #\ "Õ" ;; ;; ouml #\ "ö" ;; ;; Ouml #\ "Ö" ;; ;; szlig #\ "ß" ;; ;; thorn #\ "þ" ;; ;; THORN #\ "Þ" ;; ;; uacute 
#\ "ú" ;; ;; Uacute #\ "Ú" ;; ;; ucirc #\ "û" ;; ;; Ucirc #\ "Û" ;; ;; ugrave #\ "ù" ;; ;; Ugrave #\ "Ù" ;; ;; uuml #\ "ü" ;; ;; Uuml #\ "Ü" ;; ;; yacute #\ "ý" ;; ;; Yacute #\ "Ý" ;; ;; yuml #\ "ÿ" ;; ;; ;; iso lat 2 ;; abreve #\x "ă" ;; ;; Abreve #\x "Ă" ;; ;; amacr #\x "ā" ;; ;; Amacr #\x "Ā" ;; ;; aogon #\x "ą" ;; ;; Aogon #\x "Ą" ;; ;; cacute #\x "ć" ;; ;; Cacute #\x "Ć" ;; ;; ccaron #\x "č" ;; ;; Ccaron #\x "Č" ;; ;; ccirc #\x "ĉ" ;; ;; Ccirc #\x "Ĉ" ;; ;; cdot #\x "ċ" ;; ;; Cdot #\x "Ċ" ;; ;; dcaron #\x "ď" ;; ;; Dcaron #\x "Ď" ;; ;; dstrok #\x "đ" ;; ;; Dstrok #\x "Đ" ;; ;; ecaron #\x "ě" ;; ;; Ecaron #\x "Ě" ;; ;; edot #\x "ė" ;; ;; Edot #\x "Ė" ;; ;; emacr #\x "ē" ;; ;; Emacr #\x "Ē" ;; ;; eogon #\x "ę" ;; ;; Eogon #\x "Ę" ;; ;; gacute #\x "ǵ" ;; ;; gbreve #\x "ğ" ;; ;; Gbreve #\x "Ğ" ;; ;; Gcedil #\x "Ģ" ;; ;; gcirc #\x "ĝ" ;; ;; Gcirc #\x "Ĝ" ;; ;; gdot #\x "ġ" ;; ;; Gdot #\x "Ġ" ;; ;; hcirc #\x "ĥ" ;; ;; Hcirc #\x "Ĥ" ;; ;; hstrok #\x "ħ" ;; ;; Hstrok #\x "Ħ" ;; ;; Idot #\x "İ" ;; ;; Imacr #\x "Ī" ;; ;; imacr #\x "ī" ;; ;; ijlig #\x "ij" ;; ;; IJlig #\x "IJ" ;; ;; inodot #\x "ı" ;; ;; iogon #\x "į" ;; ;; Iogon #\x "Į" ;; ;; itilde #\x "ĩ" ;; ;; Itilde #\x "Ĩ" ;; ;; jcirc #\x "ĵ" ;; ;; Jcirc #\x "Ĵ" ;; ;; kcedil #\x "ķ" ;; ;; Kcedil #\x "Ķ" ;; ;; kgreen #\x "ĸ" ;; ;; lacute #\x "ĺ" ;; ;; Lacute #\x "Ĺ" ;; ;; lcaron #\x "ľ" ;; ;; Lcaron #\x "Ľ" ;; ;; lcedil #\x "ļ" ;; ;; Lcedil #\x "Ļ" ;; ;; lmidot #\x "ŀ" ;; ;; Lmidot #\x "Ŀ" ;; ;; lstrok #\x "ł" ;; ;; Lstrok #\x "Ł" ;; ;; nacute #\x "ń" ;; ;; Nacute #\x "Ń" ;; ;; eng #\x "ŋ" ;; ;; ENG #\x "Ŋ" ;; ;; napos #\x "ʼn" ;; ;; ncaron #\x "ň" ;; ;; Ncaron #\x "Ň" ;; ;; ncedil #\x "ņ" ;; ;; Ncedil #\x "Ņ" ;; ;; odblac #\x "ő" ;; ;; Odblac #\x "Ő" ;; ;; Omacr #\x "Ō" ;; ;; omacr #\x "ō" ;; ;; oelig #\x "œ" ;; ;; OElig #\x "Œ" ;; ;; racute #\x "ŕ" ;; ;; Racute #\x "Ŕ" ;; ;; rcaron #\x "ř" ;; ;; Rcaron #\x "Ř" ;; ;; rcedil #\x "ŗ" ;; ;; Rcedil #\x "Ŗ" ;; ;; sacute #\x "ś" ;; ;; Sacute #\x "Ś" ;; ;; scaron #\x "š" 
;; ;; Scaron #\x "Š" ;; ;; scedil #\x "ş" ;; ;; Scedil #\x "Ş" ;; ;; scirc #\x "ŝ" ;; ;; Scirc #\x "Ŝ" ;; ;; tcaron #\x "ť" ;; ;; Tcaron #\x "Ť" ;; ;; tcedil #\x "ţ" ;; ;; Tcedil #\x "Ţ" ;; ;; tstrok #\x "ŧ" ;; ;; Tstrok #\x "Ŧ" ;; ;; ubreve #\x "ŭ" ;; ;; Ubreve #\x "Ŭ" ;; ;; udblac #\x "ű" ;; ;; Udblac #\x "Ű" ;; ;; umacr #\x "ū" ;; ;; Umacr #\x "Ū" ;; ;; uogon #\x "ų" ;; ;; Uogon #\x "Ų" ;; ;; uring #\x "ů" ;; ;; Uring #\x "Ů" ;; ;; utilde #\x "ũ" ;; ;; Utilde #\x "Ũ" ;; ;; wcirc #\x "ŵ" ;; ;; Wcirc #\x "Ŵ" ;; ;; ycirc #\x "ŷ" ;; ;; Ycirc #\x "Ŷ" ;; ;; Yuml #\x "Ÿ" ;; ;; zacute #\x "ź" ;; ;; Zacute #\x "Ź" ;; ;; zcaron #\x "ž" ;; ;; Zcaron #\x "Ž" ;; ;; zdot #\x "ż" ;; ;; Zdot #\x "Ż" ;; ;; ;; iso-num ;; half #\x "½" ;; ;; frac12 #\x "½" ;; ;; frac14 #\x "¼" ;; ;; frac34 #\x "¾" ;; ;; frac18 #\x "⅛" ;; ;; frac38 #\x "⅜" ;; ;; frac58 #\x "⅝" ;; ;; frac78 #\x "⅞" ;; ;; sup1 #\x "¹" ;; ;; sup2 #\x "²" ;; ;; sup3 #\x "³" ;; ;; plus #\x "+" ;; ;; plusmn #\x "±" ;; ;; lt #\< "&#60;" ;; ;; equals #\= "=" ;; ;; gt #\> ">" ;; ;; divide #\/ "÷" ;; ;; times #\x "×" ;; ;; curren #\x "¤" ;; ;; pound #\x "£" ;; ;; dollar #\x "$" ;; ;; cent #\x "¢" ;; ;; yen #\x "¥" ;; ;; num #\x "#" ;; ;; percnt #\x "%" ;; ;; amp #\& "&#38;" ;; ;; ast #\x "*" ;; ;; commat #\x "@" ;; ;; lsqb #\x "[" ;; ;; bsol #\x "\" ;; ;; rsqb #\x "]" ;; ;; lcub #\x "{" ;; ;; horbar #\x "―" ;; ;; verbar #\x "|" ;; ;; rcub #\x "}" ;; ;; micro #\x "µ" ;; ;; ohm #\x "Ω" ;; ;; deg #\x "°" ;; ;; ordm #\x "º" ;; ;; ordf #\x "ª" ;; ;; sect #\x "§" ;; ;; para #\x "¶" ;; ;; middot #\x "·" ;; ;; larr #\x "←" ;; ;; rarr #\x "→" ;; ;; uarr #\x "↑" ;; ;; darr #\x "↓" ;; ;; copy #\x "©" ;; ;; reg #\x "®" ;; ;; trade #\x "™" ;; ;; brvbar #\x "¦" ;; ;; not #\x "¬" ;; ;; sung #\x "♩" ;; ;; excl #\x "!" ;; ;; iexcl #\x "¡" ;; ;; quot #\x """ ;; ;; apos #\x "'" ;; ;; lpar #\x "(" ;; ;; rpar #\x ")" ;; ;; comma #\x "," ;; ;; lowbar #\x "_" ;; ;; hyphen #\x "-" ;; ;; period #\x "." 
;; ;; sol #\x "/" ;; ;; colon #\x ":" ;; ;; semi #\x ";" ;; ;; quest #\x "?" ;; ;; iquest #\x "¿" ;; ;; laquo #\x "«" ;; ;; raquo #\x "»" ;; ;; lsquo #\x "'" #\x "‘" ;; ;; rsquo #\x "'" #\x "’" ;; ;; ldquo #\x """ #\x "“" ;; ;; rdquo #\x """ #\x "”" ;; ;; nbsp #\x " " ;; ;; shy #\x "­" ;; ;; ;; iso-pub ;; emsp #\x " " ;; ;; ensp #\x " " ;; ;; emsp13 #\x " " ;; ;; emsp14 #\x " " ;; ;; numsp #\x " " ;; ;; puncsp #\x " " ;; ;; thinsp #\x " " ;; ;; hairsp #\x " " ;; ;; mdash #\x "-#45;" #\x "—" ;; ;; ndash #\x "-#45;" #\x "–" ;; ;; dash #\x "-#45;" #\x "‐" ;; ;; blank #\x "␣" ;; ;; hellip #\x "…" ;; ;; nldr #\x "‥" ;; ;; frac13 #\x "⅓" ;; ;; frac23 #\x "⅔" ;; ;; frac15 #\x "⅕" ;; ;; frac25 #\x "⅖" ;; ;; frac35 #\x "⅗" ;; ;; frac45 #\x "⅘" ;; ;; frac16 #\x "⅙" ;; ;; frac56 #\x "⅚" ;; ;; incare #\x "℅" ;; ;; block #\x "█" ;; ;; uhblk #\x "▀" ;; ;; lhblk #\x "▄" ;; ;; blk14 #\x "░" ;; ;; blk12 #\x "▒" ;; ;; blk34 #\x "▓" ;; ;; marker #\x "▮" ;; ;; cir #\x "○" ;; ;; squ #\x "□" ;; ;; rect #\x "▭" ;; ;; utri #\x "▵" ;; ;; dtri #\x "▿" ;; ;; star #\x "⋆" ;; ;; bull #\x "•" ;; ;; squf #\x "▪" ;; ;; utrif #\x "▴" ;; ;; dtrif #\x "▾" ;; ;; ltrif #\x "◂" ;; ;; rtrif #\x "▸" ;; ;; clubs #\x "♣" ;; ;; diams #\x "♦" ;; ;; hearts #\x "♥" ;; ;; spades #\x "♠" ;; ;; malt #\x "✠" ;; ;; dagger #\x "†" ;; ;; Dagger #\x "‡" ;; ;; check #\x "✓" ;; ;; cross #\x "✗" ;; ;; sharp #\x "♯" ;; ;; flat #\x "♭" ;; ;; male #\x "♂" ;; ;; female #\x "♀" ;; ;; phone #\x "☎" ;; ;; telrec #\x "⌕" ;; ;; copysr #\x "℗" ;; ;; caret #\x "⁁" ;; ;; lsquor #\x "‚" ;; ;; ldquor #\x "„" ;; ;; fflig #\x "ff" ;; ;; filig #\x "fi" ;; ;; ;; ;; ffilig #\x "ffi" ;; ;; ffllig #\x "ffl" ;; ;; fllig #\x "fl" ;; ;; mldr #\x "…" ;; ;; rdquor #\x "“" ;; ;; rsquor #\x "‘" ;; ;; vellip #\x "⋮" ;; ;; hybull #\x "⁃" ;; ;; loz #\x "◊" ;; ;; lozf #\x "✦" ;; ;; ltri #\x "◃" ;; ;; rtri #\x "▹" ;; ;; starf #\x "★" ;; ;; natur #\x "♮" ;; ;; rx #\x "℞" ;; ;; sext #\x "✶" ;; ;; target #\x "⌖" ;; ;; dlcrop #\x "⌍" ;; ;; drcrop #\x "⌌" ;; 
;; ulcrop #\x "⌏" ;; ;; urcrop #\x "⌎" ;; ) by #'cddr do (setf (gethash (char-code char) *utf-8-table*) (entity-to-utf-8 entity))) ;;; EOF