This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
worblehat-old/lisp/enc-marc-8.lisp

551 lines
18 KiB
Common Lisp
Raw Normal View History

(in-package #:z3950)
(babel-encodings::define-character-encoding :marc-8
"Encoding developed incrementally since 1968 for use with the MARC
format for bibliographic records. Although UTF-8 can now be used with
MARC, many systems (including Library of Congress) still use the MARC
8 encoding."
)
;;; Decoding tables.
(defconstant undef #xfffd)
(defparameter *basic-ascii-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents (loop for i from #x21 below #x7f collect i))
"http://lcweb2.loc.gov/diglib/codetables/42.html")
(defparameter *greek-symbols-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents `(,@(loop for i from #x21 below #x61 collect undef)
#x03b1 #x03b2 #x03b3
,@(loop for i from #x64 below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/2.html")
(defparameter *subscripts-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
#x208d #x208e ,undef #x208a ,undef #x208b ,undef ,undef
,@(loop for marc-8 from #x30 below #x3a
and ucs from #x2080
collect ucs)
,@(loop for marc-8 from #x3a below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/3.html")
(defparameter *superscripts-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
#x207d #x207e ,undef #x207a ,undef #x207b ,undef ,undef
#x2070 #x00b9 #x00b2 #x00b3
,@(loop for marc-8 from #x34 below #x3a
and ucs from #x2074
collect ucs)
,@(loop for marc-8 from #x3a below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/4.html")
(defparameter *extended-latin-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x0141 #x00d8 #x0110 #x00de #x00c6 #x0152 #x02b9 ; a1--a7
#x00b7 #x266d #x00ae #x00b1 #x01a0 #x01af #x02bc ,undef ; af
#x02bb #x0142 #x00f8 #x0111 #x00fe #x00e6 #x0153 #x02ba ; b7
#x0131 #x00a3 #x00f0 ,undef #x01a1 #x01b0 ,undef ,undef ; bf
#x00b0 #x2113 #x2117 #x00a9 #x266f #x00bf #x00a1 #x00df ; c7
#x20ac ,@(loop for i from #xc9 below #xe0 collect undef)
#x0309 #x0300 #x0301 #x0302 #x0303 #x0304 #x0306 #x0307 ; e7
#x0308 #x030c #x030a ; ea
#xfe20 #xfe21 ; see note 1
#x0315 #x030b #x0310 ; ef
#x0327 #x0328 #x0323 #x0324 #x0325 #x0333 #x0332 #x0326 ; f7
#x031c #x032e ; f9
#xfe22 #xfe23 ; see note 2
,undef ,undef #x0313))
"http://lcweb2.loc.gov/diglib/codetables/45.html")
(defparameter *basic-hebrew-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x21 #x05f4 #x23 #x24 #x25 #x26 #x05f3
#x28 #x29 #x2a #x2b #x2c #x05be #x2e #x2f
,@(loop for marc-8 from #x30 below #x40 collect marc-8)
#x05b7 #x05b8 #x05b6 #x05b5 #x05b4 #x05b9 #x05bb #x05b0
#x05b2 #x05b3 #x05b1 #x05bc #x05bf #x05c1 #xfb1e
,@(loop for marc-8 from #x4f below #x5b collect undef)
#x5b ,undef #x5d ,undef ,undef
,@(loop for marc-8 from #x60 below #x7b and ucs from #x05d0
collect ucs)
#x05f0 #x05f1 #x05f2 ,undef))
"http://lcweb2.loc.gov/diglib/codetables/32.html")
(defparameter *basic-cyrillic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x40 collect marc-8)
#x044e #x0430 #x0431 #x0446 #x0434 #x0435 #x0444 #x0433
#x0445 #x0438 #x0439 #x043a #x043b #x043c #x043d #x043e
#x043f #x044f #x0440 #x0441 #x0442 #x0443 #x0436 #x0432
#x044c #x044b #x0437 #x0448 #x044d #x0449 #x0447 #x044a
#x042e #x0410 #x0411 #x0426 #x0414 #x0415 #x0424 #x0413
#x0425 #x0418 #x0419 #x041a #x041b #x041c #x041d #x041e
#x041f #x042f #x0420 #x0421 #x0422 #x0423 #x0416 #x0412
#x042c #x042b #x0417 #x0428 #x042d #x0429 #x0427))
"http://lcweb2.loc.gov/diglib/codetables/4E.html")
(defparameter *extended-cyrillic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for i from #x21 below #x40 collect undef)
#x0491 #x0452 #x0453 #x0454 #x0451 #x0455 #x0456 #x0457
#x0458 #x0459 #x045A #x045B #x045C #x045E #x045F ,undef
#x0463 #x0473 #x0475 #x046B
,@(loop for i from #x54 below #x5b collect undef)
#x005B ,undef #x005D ,undef #x005F
#x0490 #x0402 #x0403 #x0404 #x0401 #x0405 #x0406 #x0407
#x0408 #x0409 #x040A #x040B #x040C #x040E #x040F #x042A
#x0462 #x0472 #x0474 #x046A
,@(loop for i from #x74 below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/51.html")
(defparameter *basic-arabic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x21 #x22 #x23 #x24 #x066a #x26 #x27
#x28 #x29 #x066d #x2b #x060c #x2d #x2e #x2f
#x0660 #x0661 #x0662 #x0663 #x0664 #x0665 #x0666 #x0667
#x0668 #x0669 #x003a #x061b #x003c #x003d #x003e #x061f
,undef
,@(loop for m from #x41 below #x5b and ucs from #x0621 collect ucs)
#x5b ,undef #xfd ,undef ,undef
,@(loop for m from #x60 below #x73 and ucs from #x0640 collect ucs)
#x0671 #x0670 ,undef ,undef ,undef
#x066c #x201d #x201c ,undef ,undef ,undef ,undef))
"http://lcweb2.loc.gov/diglib/codetables/33.html")
(defparameter *extended-arabic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x06fd #x0672 #x0673
,@(loop for m from #x24 below #x32 and ucs from #x0679 collect ucs)
#x06bf
,@(loop for m from #x33 below #x49 and u from #x0687 collect u)
#x06fa #x069d #x069e #x06fb #x069f #x06a0 #x06fc
,@(loop for m from #x50 below #x6c and u from #x06a1 collect u)
#x06b9 #x06be #x06c0 #x06c4
#x06c5 #x06c6 #x06ca #x06cb #x06cd #x06ce #x06d0 #x06d2
#x06d3 ,undef ,undef ,undef ,undef #x0306 #x030c))
"http://lcweb2.loc.gov/diglib/codetables/34.html")
(defparameter *basic-greek-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`( #x0300 #x0301 #x0308 #x0342 #x0313 #x0314 #x0345
,undef ,undef ,undef ,undef ,undef ,undef ,undef ,undef
#x00ab #x00bb #x201c #x201d #x0374 #x0375 ,undef ,undef
,undef ,undef ,undef #x0387 ,undef ,undef ,undef #x037e
,undef #x0391 #x0392 ,undef #x0393 #x0394 #x0395 #x03da
#x03dc #x0396 #x0397 #x0398 #x0399 #x039a #x039b #x039c
#x039d #x039e #x039f #x03a0 #x03de #x03a1 #x03a3 ,undef
#x03a4 #x03a5 #x03a6 #x03a7 #x03a8 #x03a9 #x03e0 ,undef
,undef #x03b1 #x03b2 #x03d0 #x03b3 #x03b4 #x03b5 #x03db
#x03dd #x03b6 #x03b7 #x03b8 #x03b9 #x03ba #x03bb #x03bc
#x03bd #x03be #x03bf #x03c0 #x03df #x03c1 #x03c3 #x03c2
#x03c4 #x03c5 #x03c6 #x03c7 #x03c8 #x03c9 #x03e1))
"http://lcweb2.loc.gov/diglib/codetables/53.html")
(defparameter *eacc-decode-table*
(let ((ht (make-hash-table)))
(with-open-file (input (asdf:system-relative-pathname '#:z3950
#p"eacc2uni.txt"))
(let ((start 0) end m u)
(loop
for line = (read-line input nil nil)
while line
do
(setq end (position #\, line)
m (parse-integer line :end end :radix 16)
start (1+ end)
end (position #\, line :start start)
u (parse-integer line :start start :end end :radix 16))
(setf (gethash m ht) u))))
ht)
"http://www.loc.gov/marc/specifications/specchareacc.html")
;;; Decoder.
(defmacro illegal-escape (getter src i)
(alexandria:with-unique-names (j m)
`(error "Illegal escape sequence:~{ ~X~}"
(reverse (loop with ,j = ,i
for ,m = (,getter ,src (decf ,j))
collect ,m
until (= ,m #x1b))))))
(babel-encodings::define-code-point-counter :marc-8 (getter type)
`(lambda (seq start end max)
(declare (type ,type seq) (fixnum start end max))
(let ((nchars 0)
(g0-width 1)
(g1-width 1)
(i start))
(macrolet ((input () `(prog1 (,',getter seq i) (incf i))))
(tagbody
start
(when (= i end)
(go accept))
(let ((m (input)))
(cond ((= m #x1b) (go esc))
((or (= m #xa0) (= m #xff)) (go reserved))
((< m #x21)
(incf nchars)
(go start))
((< m #x7f) (cond ((= g0-width 1)
(incf nchars)
(go start))
(t (go triple-2))))
((< m #xa1)
(incf nchars)
(go start))
(t (cond ((= g1-width 1)
(incf nchars)
(go start))
(t (go triple-2))))))
esc
(when (= i end)
(go illegal-escape))
(case (input)
((#x67 #x62 #x70 #x73) (go start))
((#x28 #x2c #x29 #x2d) (go intermediate-single-1))
(#x24 (go intermediate-multiple-1))
(t (go illegal-escape)))
intermediate-single-1
(when (= i end)
(go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (go start))
(#x21 (go intermediate-single-2))
(t (go illegal-escape)))
intermediate-single-2
(when (= i end)
(go illegal-escape))
(case (input)
(#x45 (go start))
(t (go illegal-escape)))
intermediate-multiple-1
(when (= i end)
(go illegal-escape))
(case (input)
(#x2c (go intermediate-multiple-2-g0))
(#x31 (setf g0-width 3)
(go start))
((#x29 #x2d) (go intermediate-multiple-2-g1))
(t (go illegal-escape)))
intermediate-multiple-2-g0
(when (= i end)
(go illegal-escape))
(case (input)
(#x31 (setf g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g1
(when (= i end)
(go illegal-escape))
(case (input)
(#x31 (setf g1-width 3)
(go start))
(t (go illegal-escape)))
triple-2
(when (= i end)
(go partial))
(input)
(go triple-3)
triple-3
(when (= i end)
(go partial))
(input)
(incf nchars)
(go start)
partial
(error "Partial multiple-byte character.")
reserved
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter seq (1- i)))
illegal-escape
(illegal-escape ,getter seq i)
accept
))
(values nchars end))))
(babel-encodings::define-decoder :marc-8 (getter src-type setter dest-type)
`(lambda (src start end dest d-start)
(declare (type ,src-type src)
(type ,dest-type dest)
(fixnum start end d-start))
(let ((g0 *basic-ascii-decode-table*)
(g1 *extended-latin-decode-table*)
m
(g0-width 1) (g1-width 1)
(i start)
(di d-start))
(labels ((iso2022 (u)
(ecase u
(#x33 *basic-arabic-decode-table*)
(#x34 *extended-arabic-decode-table*)
(#x42 *basic-ascii-decode-table*)
(#x21 (case (,getter src (incf i))
(#x45 *extended-latin-decode-table*)
(t (illegal-escape ,getter src i))))
(#x31 *eacc-decode-table*)
(#x4e *basic-cyrillic-decode-table*)
(#x51 *extended-cyrillic-decode-table*)
(#x53 *basic-greek-decode-table*)
(#x32 *basic-hebrew-decode-table*))))
(macrolet ((input () `(prog1 (,',getter src i) (incf i))))
(tagbody
start
(when (= i end) (go accept))
(setq m (input))
(cond
((= m #x1b) (go esc))
((or (= m #xa0) (= m #xff)) (go reserved))
((< m #x21)
(,setter m dest di)
(incf di)
(go start))
((< m #x7f) (cond ((= g0-width 1)
(,setter (aref g0 (- m #x21)) dest di)
(incf di)
(go start))
(t (go triple-2-g0))))
((< m #xa1)
(,setter m dest di)
(incf di)
(go start))
(t (cond ((= g1-width 1)
(,setter (aref g1 (- m #xa1)) dest di)
(incf di)
(go start))
(t (go triple-2-g1)))))
esc
(when (= i end) (go illegal-escape))
(case (input)
(#x67 (setf g0 *basic-greek-decode-table*
g0-width 1)
(go start))
(#x62 (setf g0 *subscripts-decode-table*
g0-width 1)
(go start))
(#x70 (setf g0 *superscripts-decode-table*
g0-width 1)
(go start))
(#x73 (setf g0 *basic-ascii-decode-table*
g0-width 1)
(go start))
((#x28 #x2c)
(go intermediate-single-1-g0))
((#x29 #x2d)
(go intermediate-single-1-g1))
(#x24 (go intermediate-multiple-1))
(t (go illegal-escape)))
intermediate-single-1-g0
(when (= i end) (go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
(setf g0 (iso2022 (,getter src (1- i)))
g0-width 1)
(go start))
(#x21 (go intermediate-single-2-g0))
(t (go illegal-escape)))
intermediate-single-1-g1
(when (= i end) (go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
(setf g1 (iso2022 (,getter src (1- i)))
g1-width 1)
(go start))
(#x21 (go intermediate-single-2-g1))
(t (go illegal-escape)))
intermediate-single-2-g0
(when (= i end) (go illegal-escape))
(case (input)
(#x45 (setf g0 *extended-latin-decode-table*
g0-width 1)
(go start))
(t (go illegal-escape)))
intermediate-single-2-g1
(when (= i end) (go illegal-escape))
(case (input)
(#x45 (setf g1 *extended-latin-decode-table*
g1-width 1)
(go start))
(t (go illegal-escape)))
intermediate-multiple-1
(when (= i end) (go illegal-escape))
(case (input)
(#x2c (go intermediate-multiple-2-g0))
((#x29 #x2d) (go intermediate-multiple-2-g1))
(#x31 (setf g0 *eacc-decode-table*
g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g0
(when (= i end) (go illegal-escape))
(case (input)
(#x31 (setf g0 *eacc-decode-table*
g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g1
(when (= i end) (go illegal-escape))
(case (input)
(#x31 (setf g1 *eacc-decode-table*
g1-width 3)
(go start))
(t (go illegal-escape)))
triple-2-g0
(when (= i end)
(go partial))
(input)
(go triple-3-g0)
triple-2-g1
(when (= i end)
(go partial))
(input)
(go triple-3-g1)
triple-3-g0
(when (= i end)
(go partial))
(input)
(let* ((m1 (,getter src (- i 3)))
(m2 (,getter src (- i 2)))
(m3 (,getter src (- i 1)))
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
(,setter (or (gethash key g0 nil) ,undef)
dest di)
(incf di))
(go start)
triple-3-g1
(when (= i end)
(go partial))
(input)
(let* ((m1 (- (,getter src (- i 3)) #x80))
(m2 (- (,getter src (- i 2)) #x80))
(m3 (- (,getter src (- i 1)) #x80))
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
(,setter (or (gethash key g1 nil) ,undef)
dest di)
(incf di))
(go start)
partial
(error "Partial multiple-byte character.")
reserved
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter src (1- i)))
illegal-escape
(illegal-escape ,getter src i)
accept
))))))
;;; This seems to be the only way to get Babel to use the new decoder.
(defparameter babel::*string-vector-mappings*
(babel-encodings:instantiate-concrete-mappings
:octet-seq-setter babel::ub-set
:octet-seq-getter babel::ub-get
:octet-seq-type (simple-array (unsigned-byte 8) (*))
:code-point-seq-setter babel::string-set
:code-point-seq-getter babel::string-get
:code-point-seq-type babel:simple-unicode-string))
;;; Testing.
(defun ub8v (&rest numbers)
(make-array (length numbers) :element-type '(unsigned-byte 8)
:initial-contents numbers))
(defun ub16v (&rest numbers)
(make-array (length numbers) :element-type '(unsigned-byte 16)
:initial-contents numbers))
(defun test-kjorbye-marc-8 ()
(ub8v 75 105 178 114 98 121 101 32 66 101 114 116 101 108 115 101 110 44
32 73 110 103 101 114))
(defun eacc (s)
(let* ((key (+ (* (+ (* (char-code (char s 0)) #x100)
(char-code (char s 1))) #x100)
(char-code (char s 2))))
(value (gethash key *eacc-decode-table*)))
(format t "~x~%" value)
(code-char value)))
(define-test decode
(assert-equal "Kiørbye Bertelsen, Inger"
(babel:octets-to-string (test-kjorbye-marc-8)
:encoding :marc-8))
(assert-equal "β-tubulin"
(babel:octets-to-string (babel:string-to-octets
"gbs-tubulin")
:encoding :marc-8))
(assert-equal "羅貫中 ; 張亦文譯."
(babel:octets-to-string (babel:string-to-octets "$1!R7!Yn!04(B ; $1!=G!0\\!BX!YF(B.")
:encoding :marc-8))
(assert-equal "羅貫中 ; 張亦文譯."
(babel:octets-to-string (babel:string-to-octets "$)1¡Ò·¡Ùî¡°´ ; ¡½Ç¡°Ü¡ÂØ¡ÙÆ)!E." :encoding :latin-1)
:encoding :marc-8)))
(define-test encode
(assert-equalp (test-kjorbye-marc-8)
(babel:string-to-octets "Kiørbye Bertelsen, Inger"
:encoding :marc-8))
(assert-equal "gbs-tubulin"
(babel:octets-to-string
(babel:string-to-octets "β-tubulin" :encoding :marc-8))))
;;; Unicode normalization.
(defun tt-to-number (line)
(multiple-value-bind (a b c d) (ppcre:scan "<tt>([^<]+)</tt>" line)
(declare (ignore a b))
(loop
with start = (aref c 0)
while (> (aref d 0) start)
for end = (or (position #\Space line :start start :end (aref d 0))
(aref d 0))
collect (parse-integer line :start start :end end :radix 16)
do (setf start (1+ end)))))
(defun read-normalization-table ()
(with-open-file (stream "/Users/ljosa/tmp/chart_Latin.html")
(let ((result nil))
(tagbody
(loop
for line = (read-line stream nil nil)
when (not line)
do (go done)
until (string-equal line "<tr>"))
(let ((a (tt-to-number (read-line stream)))
(b (tt-to-number (read-line stream)))
(c (tt-to-number (read-line stream)))
(d (tt-to-number (read-line stream)))
(e (tt-to-number (read-line stream))))
(declare (ignore a d e))
(push (list b c) result))
done)
(nreverse result))))