551 lines
18 KiB
Common Lisp
551 lines
18 KiB
Common Lisp
(in-package #:z3950)
|
||
|
||
(babel-encodings::define-character-encoding :marc-8
|
||
"Encoding developed incrementally since 1968 for use with the MARC
|
||
format for bibliographic records. Although UTF-8 can now be used with
|
||
MARC, many systems (including Library of Congress) still use the MARC
|
||
8 encoding."
|
||
)
|
||
|
||
|
||
;;; Decoding tables.
|
||
|
||
(defconstant undef #xfffd)
|
||
|
||
(defparameter *basic-ascii-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents (loop for i from #x21 below #x7f collect i))
|
||
"http://lcweb2.loc.gov/diglib/codetables/42.html")
|
||
|
||
(defparameter *greek-symbols-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents `(,@(loop for i from #x21 below #x61 collect undef)
|
||
#x03b1 #x03b2 #x03b3
|
||
,@(loop for i from #x64 below #x7f collect undef)))
|
||
"http://lcweb2.loc.gov/diglib/codetables/2.html")
|
||
|
||
(defparameter *subscripts-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
|
||
#x208d #x208e ,undef #x208a ,undef #x208b ,undef ,undef
|
||
,@(loop for marc-8 from #x30 below #x3a
|
||
and ucs from #x2080
|
||
collect ucs)
|
||
,@(loop for marc-8 from #x3a below #x7f collect undef)))
|
||
"http://lcweb2.loc.gov/diglib/codetables/3.html")
|
||
|
||
(defparameter *superscripts-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
|
||
#x207d #x207e ,undef #x207a ,undef #x207b ,undef ,undef
|
||
#x2070 #x00b9 #x00b2 #x00b3
|
||
,@(loop for marc-8 from #x34 below #x3a
|
||
and ucs from #x2074
|
||
collect ucs)
|
||
,@(loop for marc-8 from #x3a below #x7f collect undef)))
|
||
"http://lcweb2.loc.gov/diglib/codetables/4.html")
|
||
|
||
(defparameter *extended-latin-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(#x0141 #x00d8 #x0110 #x00de #x00c6 #x0152 #x02b9 ; a1--a7
|
||
#x00b7 #x266d #x00ae #x00b1 #x01a0 #x01af #x02bc ,undef ; af
|
||
#x02bb #x0142 #x00f8 #x0111 #x00fe #x00e6 #x0153 #x02ba ; b7
|
||
#x0131 #x00a3 #x00f0 ,undef #x01a1 #x01b0 ,undef ,undef ; bf
|
||
#x00b0 #x2113 #x2117 #x00a9 #x266f #x00bf #x00a1 #x00df ; c7
|
||
#x20ac ,@(loop for i from #xc9 below #xe0 collect undef)
|
||
#x0309 #x0300 #x0301 #x0302 #x0303 #x0304 #x0306 #x0307 ; e7
|
||
#x0308 #x030c #x030a ; ea
|
||
#xfe20 #xfe21 ; see note 1
|
||
#x0315 #x030b #x0310 ; ef
|
||
#x0327 #x0328 #x0323 #x0324 #x0325 #x0333 #x0332 #x0326 ; f7
|
||
#x031c #x032e ; f9
|
||
#xfe22 #xfe23 ; see note 2
|
||
,undef ,undef #x0313))
|
||
"http://lcweb2.loc.gov/diglib/codetables/45.html")
|
||
|
||
(defparameter *basic-hebrew-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(#x21 #x05f4 #x23 #x24 #x25 #x26 #x05f3
|
||
#x28 #x29 #x2a #x2b #x2c #x05be #x2e #x2f
|
||
,@(loop for marc-8 from #x30 below #x40 collect marc-8)
|
||
#x05b7 #x05b8 #x05b6 #x05b5 #x05b4 #x05b9 #x05bb #x05b0
|
||
#x05b2 #x05b3 #x05b1 #x05bc #x05bf #x05c1 #xfb1e
|
||
,@(loop for marc-8 from #x4f below #x5b collect undef)
|
||
#x5b ,undef #x5d ,undef ,undef
|
||
,@(loop for marc-8 from #x60 below #x7b and ucs from #x05d0
|
||
collect ucs)
|
||
#x05f0 #x05f1 #x05f2 ,undef))
|
||
"http://lcweb2.loc.gov/diglib/codetables/32.html")
|
||
|
||
(defparameter *basic-cyrillic-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(,@(loop for marc-8 from #x21 below #x40 collect marc-8)
|
||
#x044e #x0430 #x0431 #x0446 #x0434 #x0435 #x0444 #x0433
|
||
#x0445 #x0438 #x0439 #x043a #x043b #x043c #x043d #x043e
|
||
#x043f #x044f #x0440 #x0441 #x0442 #x0443 #x0436 #x0432
|
||
#x044c #x044b #x0437 #x0448 #x044d #x0449 #x0447 #x044a
|
||
#x042e #x0410 #x0411 #x0426 #x0414 #x0415 #x0424 #x0413
|
||
#x0425 #x0418 #x0419 #x041a #x041b #x041c #x041d #x041e
|
||
#x041f #x042f #x0420 #x0421 #x0422 #x0423 #x0416 #x0412
|
||
#x042c #x042b #x0417 #x0428 #x042d #x0429 #x0427))
|
||
"http://lcweb2.loc.gov/diglib/codetables/4E.html")
|
||
|
||
(defparameter *extended-cyrillic-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(,@(loop for i from #x21 below #x40 collect undef)
|
||
#x0491 #x0452 #x0453 #x0454 #x0451 #x0455 #x0456 #x0457
|
||
#x0458 #x0459 #x045A #x045B #x045C #x045E #x045F ,undef
|
||
#x0463 #x0473 #x0475 #x046B
|
||
,@(loop for i from #x54 below #x5b collect undef)
|
||
#x005B ,undef #x005D ,undef #x005F
|
||
#x0490 #x0402 #x0403 #x0404 #x0401 #x0405 #x0406 #x0407
|
||
#x0408 #x0409 #x040A #x040B #x040C #x040E #x040F #x042A
|
||
#x0462 #x0472 #x0474 #x046A
|
||
,@(loop for i from #x74 below #x7f collect undef)))
|
||
"http://lcweb2.loc.gov/diglib/codetables/51.html")
|
||
|
||
(defparameter *basic-arabic-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(#x21 #x22 #x23 #x24 #x066a #x26 #x27
|
||
#x28 #x29 #x066d #x2b #x060c #x2d #x2e #x2f
|
||
#x0660 #x0661 #x0662 #x0663 #x0664 #x0665 #x0666 #x0667
|
||
#x0668 #x0669 #x003a #x061b #x003c #x003d #x003e #x061f
|
||
,undef
|
||
,@(loop for m from #x41 below #x5b and ucs from #x0621 collect ucs)
|
||
#x5b ,undef #xfd ,undef ,undef
|
||
,@(loop for m from #x60 below #x73 and ucs from #x0640 collect ucs)
|
||
#x0671 #x0670 ,undef ,undef ,undef
|
||
#x066c #x201d #x201c ,undef ,undef ,undef ,undef))
|
||
"http://lcweb2.loc.gov/diglib/codetables/33.html")
|
||
|
||
(defparameter *extended-arabic-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`(#x06fd #x0672 #x0673
|
||
,@(loop for m from #x24 below #x32 and ucs from #x0679 collect ucs)
|
||
#x06bf
|
||
,@(loop for m from #x33 below #x49 and u from #x0687 collect u)
|
||
#x06fa #x069d #x069e #x06fb #x069f #x06a0 #x06fc
|
||
,@(loop for m from #x50 below #x6c and u from #x06a1 collect u)
|
||
#x06b9 #x06be #x06c0 #x06c4
|
||
#x06c5 #x06c6 #x06ca #x06cb #x06cd #x06ce #x06d0 #x06d2
|
||
#x06d3 ,undef ,undef ,undef ,undef #x0306 #x030c))
|
||
"http://lcweb2.loc.gov/diglib/codetables/34.html")
|
||
|
||
(defparameter *basic-greek-decode-table*
|
||
(make-array
|
||
#x5e :element-type '(unsigned-byte 16)
|
||
:initial-contents
|
||
`( #x0300 #x0301 #x0308 #x0342 #x0313 #x0314 #x0345
|
||
,undef ,undef ,undef ,undef ,undef ,undef ,undef ,undef
|
||
#x00ab #x00bb #x201c #x201d #x0374 #x0375 ,undef ,undef
|
||
,undef ,undef ,undef #x0387 ,undef ,undef ,undef #x037e
|
||
,undef #x0391 #x0392 ,undef #x0393 #x0394 #x0395 #x03da
|
||
#x03dc #x0396 #x0397 #x0398 #x0399 #x039a #x039b #x039c
|
||
#x039d #x039e #x039f #x03a0 #x03de #x03a1 #x03a3 ,undef
|
||
#x03a4 #x03a5 #x03a6 #x03a7 #x03a8 #x03a9 #x03e0 ,undef
|
||
,undef #x03b1 #x03b2 #x03d0 #x03b3 #x03b4 #x03b5 #x03db
|
||
#x03dd #x03b6 #x03b7 #x03b8 #x03b9 #x03ba #x03bb #x03bc
|
||
#x03bd #x03be #x03bf #x03c0 #x03df #x03c1 #x03c3 #x03c2
|
||
#x03c4 #x03c5 #x03c6 #x03c7 #x03c8 #x03c9 #x03e1))
|
||
"http://lcweb2.loc.gov/diglib/codetables/53.html")
|
||
|
||
(defparameter *eacc-decode-table*
|
||
(let ((ht (make-hash-table)))
|
||
(with-open-file (input (asdf:system-relative-pathname '#:z3950
|
||
#p"eacc2uni.txt"))
|
||
(let ((start 0) end m u)
|
||
(loop
|
||
for line = (read-line input nil nil)
|
||
while line
|
||
do
|
||
(setq end (position #\, line)
|
||
m (parse-integer line :end end :radix 16)
|
||
start (1+ end)
|
||
end (position #\, line :start start)
|
||
u (parse-integer line :start start :end end :radix 16))
|
||
(setf (gethash m ht) u))))
|
||
ht)
|
||
"http://www.loc.gov/marc/specifications/specchareacc.html")
|
||
|
||
|
||
;;; Decoder.
|
||
|
||
(defmacro illegal-escape (getter src i)
|
||
(alexandria:with-unique-names (j m)
|
||
`(error "Illegal escape sequence:~{ ~X~}"
|
||
(reverse (loop with ,j = ,i
|
||
for ,m = (,getter ,src (decf ,j))
|
||
collect ,m
|
||
until (= ,m #x1b))))))
|
||
|
||
(babel-encodings::define-code-point-counter :marc-8 (getter type)
|
||
`(lambda (seq start end max)
|
||
(declare (type ,type seq) (fixnum start end max))
|
||
(let ((nchars 0)
|
||
(g0-width 1)
|
||
(g1-width 1)
|
||
(i start))
|
||
(macrolet ((input () `(prog1 (,',getter seq i) (incf i))))
|
||
(tagbody
|
||
start
|
||
(when (= i end)
|
||
(go accept))
|
||
(let ((m (input)))
|
||
(cond ((= m #x1b) (go esc))
|
||
((or (= m #xa0) (= m #xff)) (go reserved))
|
||
((< m #x21)
|
||
(incf nchars)
|
||
(go start))
|
||
((< m #x7f) (cond ((= g0-width 1)
|
||
(incf nchars)
|
||
(go start))
|
||
(t (go triple-2))))
|
||
((< m #xa1)
|
||
(incf nchars)
|
||
(go start))
|
||
(t (cond ((= g1-width 1)
|
||
(incf nchars)
|
||
(go start))
|
||
(t (go triple-2))))))
|
||
esc
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
((#x67 #x62 #x70 #x73) (go start))
|
||
((#x28 #x2c #x29 #x2d) (go intermediate-single-1))
|
||
(#x24 (go intermediate-multiple-1))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-1
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (go start))
|
||
(#x21 (go intermediate-single-2))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-2
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
(#x45 (go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-1
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
(#x2c (go intermediate-multiple-2-g0))
|
||
(#x31 (setf g0-width 3)
|
||
(go start))
|
||
((#x29 #x2d) (go intermediate-multiple-2-g1))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-2-g0
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
(#x31 (setf g0-width 3)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-2-g1
|
||
(when (= i end)
|
||
(go illegal-escape))
|
||
(case (input)
|
||
(#x31 (setf g1-width 3)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
triple-2
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(go triple-3)
|
||
triple-3
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(incf nchars)
|
||
(go start)
|
||
partial
|
||
(error "Partial multiple-byte character.")
|
||
reserved
|
||
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter seq (1- i)))
|
||
illegal-escape
|
||
(illegal-escape ,getter seq i)
|
||
accept
|
||
))
|
||
(values nchars end))))
|
||
|
||
(babel-encodings::define-decoder :marc-8 (getter src-type setter dest-type)
|
||
`(lambda (src start end dest d-start)
|
||
(declare (type ,src-type src)
|
||
(type ,dest-type dest)
|
||
(fixnum start end d-start))
|
||
(let ((g0 *basic-ascii-decode-table*)
|
||
(g1 *extended-latin-decode-table*)
|
||
m
|
||
(g0-width 1) (g1-width 1)
|
||
(i start)
|
||
(di d-start))
|
||
(labels ((iso2022 (u)
|
||
(ecase u
|
||
(#x33 *basic-arabic-decode-table*)
|
||
(#x34 *extended-arabic-decode-table*)
|
||
(#x42 *basic-ascii-decode-table*)
|
||
(#x21 (case (,getter src (incf i))
|
||
(#x45 *extended-latin-decode-table*)
|
||
(t (illegal-escape ,getter src i))))
|
||
(#x31 *eacc-decode-table*)
|
||
(#x4e *basic-cyrillic-decode-table*)
|
||
(#x51 *extended-cyrillic-decode-table*)
|
||
(#x53 *basic-greek-decode-table*)
|
||
(#x32 *basic-hebrew-decode-table*))))
|
||
(macrolet ((input () `(prog1 (,',getter src i) (incf i))))
|
||
(tagbody
|
||
start
|
||
(when (= i end) (go accept))
|
||
(setq m (input))
|
||
(cond
|
||
((= m #x1b) (go esc))
|
||
((or (= m #xa0) (= m #xff)) (go reserved))
|
||
((< m #x21)
|
||
(,setter m dest di)
|
||
(incf di)
|
||
(go start))
|
||
((< m #x7f) (cond ((= g0-width 1)
|
||
(,setter (aref g0 (- m #x21)) dest di)
|
||
(incf di)
|
||
(go start))
|
||
(t (go triple-2-g0))))
|
||
((< m #xa1)
|
||
(,setter m dest di)
|
||
(incf di)
|
||
(go start))
|
||
(t (cond ((= g1-width 1)
|
||
(,setter (aref g1 (- m #xa1)) dest di)
|
||
(incf di)
|
||
(go start))
|
||
(t (go triple-2-g1)))))
|
||
esc
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x67 (setf g0 *basic-greek-decode-table*
|
||
g0-width 1)
|
||
(go start))
|
||
(#x62 (setf g0 *subscripts-decode-table*
|
||
g0-width 1)
|
||
(go start))
|
||
(#x70 (setf g0 *superscripts-decode-table*
|
||
g0-width 1)
|
||
(go start))
|
||
(#x73 (setf g0 *basic-ascii-decode-table*
|
||
g0-width 1)
|
||
(go start))
|
||
((#x28 #x2c)
|
||
(go intermediate-single-1-g0))
|
||
((#x29 #x2d)
|
||
(go intermediate-single-1-g1))
|
||
(#x24 (go intermediate-multiple-1))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-1-g0
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
|
||
(setf g0 (iso2022 (,getter src (1- i)))
|
||
g0-width 1)
|
||
(go start))
|
||
(#x21 (go intermediate-single-2-g0))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-1-g1
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
|
||
(setf g1 (iso2022 (,getter src (1- i)))
|
||
g1-width 1)
|
||
(go start))
|
||
(#x21 (go intermediate-single-2-g1))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-2-g0
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x45 (setf g0 *extended-latin-decode-table*
|
||
g0-width 1)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-single-2-g1
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x45 (setf g1 *extended-latin-decode-table*
|
||
g1-width 1)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-1
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x2c (go intermediate-multiple-2-g0))
|
||
((#x29 #x2d) (go intermediate-multiple-2-g1))
|
||
(#x31 (setf g0 *eacc-decode-table*
|
||
g0-width 3)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-2-g0
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x31 (setf g0 *eacc-decode-table*
|
||
g0-width 3)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
intermediate-multiple-2-g1
|
||
(when (= i end) (go illegal-escape))
|
||
(case (input)
|
||
(#x31 (setf g1 *eacc-decode-table*
|
||
g1-width 3)
|
||
(go start))
|
||
(t (go illegal-escape)))
|
||
triple-2-g0
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(go triple-3-g0)
|
||
triple-2-g1
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(go triple-3-g1)
|
||
triple-3-g0
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(let* ((m1 (,getter src (- i 3)))
|
||
(m2 (,getter src (- i 2)))
|
||
(m3 (,getter src (- i 1)))
|
||
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
|
||
(,setter (or (gethash key g0 nil) ,undef)
|
||
dest di)
|
||
(incf di))
|
||
(go start)
|
||
triple-3-g1
|
||
(when (= i end)
|
||
(go partial))
|
||
(input)
|
||
(let* ((m1 (- (,getter src (- i 3)) #x80))
|
||
(m2 (- (,getter src (- i 2)) #x80))
|
||
(m3 (- (,getter src (- i 1)) #x80))
|
||
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
|
||
(,setter (or (gethash key g1 nil) ,undef)
|
||
dest di)
|
||
(incf di))
|
||
(go start)
|
||
partial
|
||
(error "Partial multiple-byte character.")
|
||
reserved
|
||
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter src (1- i)))
|
||
illegal-escape
|
||
(illegal-escape ,getter src i)
|
||
accept
|
||
))))))
|
||
|
||
;;; This seems to be the only way to get Babel to use the new decoder.
|
||
|
||
(defparameter babel::*string-vector-mappings*
|
||
(babel-encodings:instantiate-concrete-mappings
|
||
:octet-seq-setter babel::ub-set
|
||
:octet-seq-getter babel::ub-get
|
||
:octet-seq-type (simple-array (unsigned-byte 8) (*))
|
||
:code-point-seq-setter babel::string-set
|
||
:code-point-seq-getter babel::string-get
|
||
:code-point-seq-type babel:simple-unicode-string))
|
||
|
||
|
||
;;; Testing.
|
||
|
||
(defun ub8v (&rest numbers)
|
||
(make-array (length numbers) :element-type '(unsigned-byte 8)
|
||
:initial-contents numbers))
|
||
|
||
(defun ub16v (&rest numbers)
|
||
(make-array (length numbers) :element-type '(unsigned-byte 16)
|
||
:initial-contents numbers))
|
||
|
||
(defun test-kjorbye-marc-8 ()
|
||
(ub8v 75 105 178 114 98 121 101 32 66 101 114 116 101 108 115 101 110 44
|
||
32 73 110 103 101 114))
|
||
|
||
(defun eacc (s)
|
||
(let* ((key (+ (* (+ (* (char-code (char s 0)) #x100)
|
||
(char-code (char s 1))) #x100)
|
||
(char-code (char s 2))))
|
||
(value (gethash key *eacc-decode-table*)))
|
||
(format t "~x~%" value)
|
||
(code-char value)))
|
||
|
||
(define-test decode
|
||
(assert-equal "Kiørbye Bertelsen, Inger"
|
||
(babel:octets-to-string (test-kjorbye-marc-8)
|
||
:encoding :marc-8))
|
||
(assert-equal "β-tubulin"
|
||
(babel:octets-to-string (babel:string-to-octets
|
||
"gbs-tubulin")
|
||
:encoding :marc-8))
|
||
(assert-equal "羅貫中 ; 張亦文譯."
|
||
(babel:octets-to-string (babel:string-to-octets "$1!R7!Yn!04(B ; $1!=G!0\\!BX!YF(B.")
|
||
:encoding :marc-8))
|
||
(assert-equal "羅貫中 ; 張亦文譯."
|
||
(babel:octets-to-string (babel:string-to-octets "$)1¡Ò·¡Ùî¡°´ ; ¡½Ç¡°Ü¡ÂØ¡ÙÆ)!E." :encoding :latin-1)
|
||
:encoding :marc-8)))
|
||
|
||
|
||
(define-test encode
|
||
(assert-equalp (test-kjorbye-marc-8)
|
||
(babel:string-to-octets "Kiørbye Bertelsen, Inger"
|
||
:encoding :marc-8))
|
||
(assert-equal "gbs-tubulin"
|
||
(babel:octets-to-string
|
||
(babel:string-to-octets "β-tubulin" :encoding :marc-8))))
|
||
|
||
|
||
;;; Unicode normalization.
|
||
|
||
(defun tt-to-number (line)
|
||
(multiple-value-bind (a b c d) (ppcre:scan "<tt>([^<]+)</tt>" line)
|
||
(declare (ignore a b))
|
||
(loop
|
||
with start = (aref c 0)
|
||
while (> (aref d 0) start)
|
||
for end = (or (position #\Space line :start start :end (aref d 0))
|
||
(aref d 0))
|
||
collect (parse-integer line :start start :end end :radix 16)
|
||
do (setf start (1+ end)))))
|
||
|
||
(defun read-normalization-table ()
|
||
(with-open-file (stream "/Users/ljosa/tmp/chart_Latin.html")
|
||
(let ((result nil))
|
||
(tagbody
|
||
(loop
|
||
for line = (read-line stream nil nil)
|
||
when (not line)
|
||
do (go done)
|
||
until (string-equal line "<tr>"))
|
||
(let ((a (tt-to-number (read-line stream)))
|
||
(b (tt-to-number (read-line stream)))
|
||
(c (tt-to-number (read-line stream)))
|
||
(d (tt-to-number (read-line stream)))
|
||
(e (tt-to-number (read-line stream))))
|
||
(declare (ignore a d e))
|
||
(push (list b c) result))
|
||
done)
|
||
(nreverse result)))) |