Projects/worblehat-old
Projects
/
worblehat-old
Archived
12
0
Fork 0
This repository has been archived on 2024-07-04. You can view files and clone it, but cannot push or open issues or pull requests.
worblehat-old/enc-marc-8.lisp

551 lines
18 KiB
Common Lisp
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

(in-package #:z3950)
(babel-encodings::define-character-encoding :marc-8
"Encoding developed incrementally since 1968 for use with the MARC
format for bibliographic records. Although UTF-8 can now be used with
MARC, many systems (including Library of Congress) still use the MARC
8 encoding."
)
;;; Decoding tables.
(defconstant undef #xfffd)
(defparameter *basic-ascii-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents (loop for i from #x21 below #x7f collect i))
"http://lcweb2.loc.gov/diglib/codetables/42.html")
(defparameter *greek-symbols-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents `(,@(loop for i from #x21 below #x61 collect undef)
#x03b1 #x03b2 #x03b3
,@(loop for i from #x64 below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/2.html")
(defparameter *subscripts-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
#x208d #x208e ,undef #x208a ,undef #x208b ,undef ,undef
,@(loop for marc-8 from #x30 below #x3a
and ucs from #x2080
collect ucs)
,@(loop for marc-8 from #x3a below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/3.html")
(defparameter *superscripts-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x28 collect undef)
#x207d #x207e ,undef #x207a ,undef #x207b ,undef ,undef
#x2070 #x00b9 #x00b2 #x00b3
,@(loop for marc-8 from #x34 below #x3a
and ucs from #x2074
collect ucs)
,@(loop for marc-8 from #x3a below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/4.html")
(defparameter *extended-latin-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x0141 #x00d8 #x0110 #x00de #x00c6 #x0152 #x02b9 ; a1--a7
#x00b7 #x266d #x00ae #x00b1 #x01a0 #x01af #x02bc ,undef ; af
#x02bb #x0142 #x00f8 #x0111 #x00fe #x00e6 #x0153 #x02ba ; b7
#x0131 #x00a3 #x00f0 ,undef #x01a1 #x01b0 ,undef ,undef ; bf
#x00b0 #x2113 #x2117 #x00a9 #x266f #x00bf #x00a1 #x00df ; c7
#x20ac ,@(loop for i from #xc9 below #xe0 collect undef)
#x0309 #x0300 #x0301 #x0302 #x0303 #x0304 #x0306 #x0307 ; e7
#x0308 #x030c #x030a ; ea
#xfe20 #xfe21 ; see note 1
#x0315 #x030b #x0310 ; ef
#x0327 #x0328 #x0323 #x0324 #x0325 #x0333 #x0332 #x0326 ; f7
#x031c #x032e ; f9
#xfe22 #xfe23 ; see note 2
,undef ,undef #x0313))
"http://lcweb2.loc.gov/diglib/codetables/45.html")
(defparameter *basic-hebrew-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x21 #x05f4 #x23 #x24 #x25 #x26 #x05f3
#x28 #x29 #x2a #x2b #x2c #x05be #x2e #x2f
,@(loop for marc-8 from #x30 below #x40 collect marc-8)
#x05b7 #x05b8 #x05b6 #x05b5 #x05b4 #x05b9 #x05bb #x05b0
#x05b2 #x05b3 #x05b1 #x05bc #x05bf #x05c1 #xfb1e
,@(loop for marc-8 from #x4f below #x5b collect undef)
#x5b ,undef #x5d ,undef ,undef
,@(loop for marc-8 from #x60 below #x7b and ucs from #x05d0
collect ucs)
#x05f0 #x05f1 #x05f2 ,undef))
"http://lcweb2.loc.gov/diglib/codetables/32.html")
(defparameter *basic-cyrillic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for marc-8 from #x21 below #x40 collect marc-8)
#x044e #x0430 #x0431 #x0446 #x0434 #x0435 #x0444 #x0433
#x0445 #x0438 #x0439 #x043a #x043b #x043c #x043d #x043e
#x043f #x044f #x0440 #x0441 #x0442 #x0443 #x0436 #x0432
#x044c #x044b #x0437 #x0448 #x044d #x0449 #x0447 #x044a
#x042e #x0410 #x0411 #x0426 #x0414 #x0415 #x0424 #x0413
#x0425 #x0418 #x0419 #x041a #x041b #x041c #x041d #x041e
#x041f #x042f #x0420 #x0421 #x0422 #x0423 #x0416 #x0412
#x042c #x042b #x0417 #x0428 #x042d #x0429 #x0427))
"http://lcweb2.loc.gov/diglib/codetables/4E.html")
(defparameter *extended-cyrillic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(,@(loop for i from #x21 below #x40 collect undef)
#x0491 #x0452 #x0453 #x0454 #x0451 #x0455 #x0456 #x0457
#x0458 #x0459 #x045A #x045B #x045C #x045E #x045F ,undef
#x0463 #x0473 #x0475 #x046B
,@(loop for i from #x54 below #x5b collect undef)
#x005B ,undef #x005D ,undef #x005F
#x0490 #x0402 #x0403 #x0404 #x0401 #x0405 #x0406 #x0407
#x0408 #x0409 #x040A #x040B #x040C #x040E #x040F #x042A
#x0462 #x0472 #x0474 #x046A
,@(loop for i from #x74 below #x7f collect undef)))
"http://lcweb2.loc.gov/diglib/codetables/51.html")
(defparameter *basic-arabic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x21 #x22 #x23 #x24 #x066a #x26 #x27
#x28 #x29 #x066d #x2b #x060c #x2d #x2e #x2f
#x0660 #x0661 #x0662 #x0663 #x0664 #x0665 #x0666 #x0667
#x0668 #x0669 #x003a #x061b #x003c #x003d #x003e #x061f
,undef
,@(loop for m from #x41 below #x5b and ucs from #x0621 collect ucs)
#x5b ,undef #xfd ,undef ,undef
,@(loop for m from #x60 below #x73 and ucs from #x0640 collect ucs)
#x0671 #x0670 ,undef ,undef ,undef
#x066c #x201d #x201c ,undef ,undef ,undef ,undef))
"http://lcweb2.loc.gov/diglib/codetables/33.html")
(defparameter *extended-arabic-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`(#x06fd #x0672 #x0673
,@(loop for m from #x24 below #x32 and ucs from #x0679 collect ucs)
#x06bf
,@(loop for m from #x33 below #x49 and u from #x0687 collect u)
#x06fa #x069d #x069e #x06fb #x069f #x06a0 #x06fc
,@(loop for m from #x50 below #x6c and u from #x06a1 collect u)
#x06b9 #x06be #x06c0 #x06c4
#x06c5 #x06c6 #x06ca #x06cb #x06cd #x06ce #x06d0 #x06d2
#x06d3 ,undef ,undef ,undef ,undef #x0306 #x030c))
"http://lcweb2.loc.gov/diglib/codetables/34.html")
(defparameter *basic-greek-decode-table*
(make-array
#x5e :element-type '(unsigned-byte 16)
:initial-contents
`( #x0300 #x0301 #x0308 #x0342 #x0313 #x0314 #x0345
,undef ,undef ,undef ,undef ,undef ,undef ,undef ,undef
#x00ab #x00bb #x201c #x201d #x0374 #x0375 ,undef ,undef
,undef ,undef ,undef #x0387 ,undef ,undef ,undef #x037e
,undef #x0391 #x0392 ,undef #x0393 #x0394 #x0395 #x03da
#x03dc #x0396 #x0397 #x0398 #x0399 #x039a #x039b #x039c
#x039d #x039e #x039f #x03a0 #x03de #x03a1 #x03a3 ,undef
#x03a4 #x03a5 #x03a6 #x03a7 #x03a8 #x03a9 #x03e0 ,undef
,undef #x03b1 #x03b2 #x03d0 #x03b3 #x03b4 #x03b5 #x03db
#x03dd #x03b6 #x03b7 #x03b8 #x03b9 #x03ba #x03bb #x03bc
#x03bd #x03be #x03bf #x03c0 #x03df #x03c1 #x03c3 #x03c2
#x03c4 #x03c5 #x03c6 #x03c7 #x03c8 #x03c9 #x03e1))
"http://lcweb2.loc.gov/diglib/codetables/53.html")
(defparameter *eacc-decode-table*
(let ((ht (make-hash-table)))
(with-open-file (input (asdf:system-relative-pathname '#:z3950
#p"eacc2uni.txt"))
(let ((start 0) end m u)
(loop
for line = (read-line input nil nil)
while line
do
(setq end (position #\, line)
m (parse-integer line :end end :radix 16)
start (1+ end)
end (position #\, line :start start)
u (parse-integer line :start start :end end :radix 16))
(setf (gethash m ht) u))))
ht)
"http://www.loc.gov/marc/specifications/specchareacc.html")
;;; Decoder.
(defmacro illegal-escape (getter src i)
(alexandria:with-unique-names (j m)
`(error "Illegal escape sequence:~{ ~X~}"
(reverse (loop with ,j = ,i
for ,m = (,getter ,src (decf ,j))
collect ,m
until (= ,m #x1b))))))
(babel-encodings::define-code-point-counter :marc-8 (getter type)
`(lambda (seq start end max)
(declare (type ,type seq) (fixnum start end max))
(let ((nchars 0)
(g0-width 1)
(g1-width 1)
(i start))
(macrolet ((input () `(prog1 (,',getter seq i) (incf i))))
(tagbody
start
(when (= i end)
(go accept))
(let ((m (input)))
(cond ((= m #x1b) (go esc))
((or (= m #xa0) (= m #xff)) (go reserved))
((< m #x21)
(incf nchars)
(go start))
((< m #x7f) (cond ((= g0-width 1)
(incf nchars)
(go start))
(t (go triple-2))))
((< m #xa1)
(incf nchars)
(go start))
(t (cond ((= g1-width 1)
(incf nchars)
(go start))
(t (go triple-2))))))
esc
(when (= i end)
(go illegal-escape))
(case (input)
((#x67 #x62 #x70 #x73) (go start))
((#x28 #x2c #x29 #x2d) (go intermediate-single-1))
(#x24 (go intermediate-multiple-1))
(t (go illegal-escape)))
intermediate-single-1
(when (= i end)
(go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (go start))
(#x21 (go intermediate-single-2))
(t (go illegal-escape)))
intermediate-single-2
(when (= i end)
(go illegal-escape))
(case (input)
(#x45 (go start))
(t (go illegal-escape)))
intermediate-multiple-1
(when (= i end)
(go illegal-escape))
(case (input)
(#x2c (go intermediate-multiple-2-g0))
(#x31 (setf g0-width 3)
(go start))
((#x29 #x2d) (go intermediate-multiple-2-g1))
(t (go illegal-escape)))
intermediate-multiple-2-g0
(when (= i end)
(go illegal-escape))
(case (input)
(#x31 (setf g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g1
(when (= i end)
(go illegal-escape))
(case (input)
(#x31 (setf g1-width 3)
(go start))
(t (go illegal-escape)))
triple-2
(when (= i end)
(go partial))
(input)
(go triple-3)
triple-3
(when (= i end)
(go partial))
(input)
(incf nchars)
(go start)
partial
(error "Partial multiple-byte character.")
reserved
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter seq (1- i)))
illegal-escape
(illegal-escape ,getter seq i)
accept
))
(values nchars end))))
(babel-encodings::define-decoder :marc-8 (getter src-type setter dest-type)
`(lambda (src start end dest d-start)
(declare (type ,src-type src)
(type ,dest-type dest)
(fixnum start end d-start))
(let ((g0 *basic-ascii-decode-table*)
(g1 *extended-latin-decode-table*)
m
(g0-width 1) (g1-width 1)
(i start)
(di d-start))
(labels ((iso2022 (u)
(ecase u
(#x33 *basic-arabic-decode-table*)
(#x34 *extended-arabic-decode-table*)
(#x42 *basic-ascii-decode-table*)
(#x21 (case (,getter src (incf i))
(#x45 *extended-latin-decode-table*)
(t (illegal-escape ,getter src i))))
(#x31 *eacc-decode-table*)
(#x4e *basic-cyrillic-decode-table*)
(#x51 *extended-cyrillic-decode-table*)
(#x53 *basic-greek-decode-table*)
(#x32 *basic-hebrew-decode-table*))))
(macrolet ((input () `(prog1 (,',getter src i) (incf i))))
(tagbody
start
(when (= i end) (go accept))
(setq m (input))
(cond
((= m #x1b) (go esc))
((or (= m #xa0) (= m #xff)) (go reserved))
((< m #x21)
(,setter m dest di)
(incf di)
(go start))
((< m #x7f) (cond ((= g0-width 1)
(,setter (aref g0 (- m #x21)) dest di)
(incf di)
(go start))
(t (go triple-2-g0))))
((< m #xa1)
(,setter m dest di)
(incf di)
(go start))
(t (cond ((= g1-width 1)
(,setter (aref g1 (- m #xa1)) dest di)
(incf di)
(go start))
(t (go triple-2-g1)))))
esc
(when (= i end) (go illegal-escape))
(case (input)
(#x67 (setf g0 *basic-greek-decode-table*
g0-width 1)
(go start))
(#x62 (setf g0 *subscripts-decode-table*
g0-width 1)
(go start))
(#x70 (setf g0 *superscripts-decode-table*
g0-width 1)
(go start))
(#x73 (setf g0 *basic-ascii-decode-table*
g0-width 1)
(go start))
((#x28 #x2c)
(go intermediate-single-1-g0))
((#x29 #x2d)
(go intermediate-single-1-g1))
(#x24 (go intermediate-multiple-1))
(t (go illegal-escape)))
intermediate-single-1-g0
(when (= i end) (go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
(setf g0 (iso2022 (,getter src (1- i)))
g0-width 1)
(go start))
(#x21 (go intermediate-single-2-g0))
(t (go illegal-escape)))
intermediate-single-1-g1
(when (= i end) (go illegal-escape))
(case (input)
((#x33 #x34 #x42 #x4e #x51 #x53 #x32)
(setf g1 (iso2022 (,getter src (1- i)))
g1-width 1)
(go start))
(#x21 (go intermediate-single-2-g1))
(t (go illegal-escape)))
intermediate-single-2-g0
(when (= i end) (go illegal-escape))
(case (input)
(#x45 (setf g0 *extended-latin-decode-table*
g0-width 1)
(go start))
(t (go illegal-escape)))
intermediate-single-2-g1
(when (= i end) (go illegal-escape))
(case (input)
(#x45 (setf g1 *extended-latin-decode-table*
g1-width 1)
(go start))
(t (go illegal-escape)))
intermediate-multiple-1
(when (= i end) (go illegal-escape))
(case (input)
(#x2c (go intermediate-multiple-2-g0))
((#x29 #x2d) (go intermediate-multiple-2-g1))
(#x31 (setf g0 *eacc-decode-table*
g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g0
(when (= i end) (go illegal-escape))
(case (input)
(#x31 (setf g0 *eacc-decode-table*
g0-width 3)
(go start))
(t (go illegal-escape)))
intermediate-multiple-2-g1
(when (= i end) (go illegal-escape))
(case (input)
(#x31 (setf g1 *eacc-decode-table*
g1-width 3)
(go start))
(t (go illegal-escape)))
triple-2-g0
(when (= i end)
(go partial))
(input)
(go triple-3-g0)
triple-2-g1
(when (= i end)
(go partial))
(input)
(go triple-3-g1)
triple-3-g0
(when (= i end)
(go partial))
(input)
(let* ((m1 (,getter src (- i 3)))
(m2 (,getter src (- i 2)))
(m3 (,getter src (- i 1)))
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
(,setter (or (gethash key g0 nil) ,undef)
dest di)
(incf di))
(go start)
triple-3-g1
(when (= i end)
(go partial))
(input)
(let* ((m1 (- (,getter src (- i 3)) #x80))
(m2 (- (,getter src (- i 2)) #x80))
(m3 (- (,getter src (- i 1)) #x80))
(key (+ (* (+ (* m1 #x100) m2) #x100) m3)))
(,setter (or (gethash key g1 nil) ,undef)
dest di)
(incf di))
(go start)
partial
(error "Partial multiple-byte character.")
reserved
(error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter src (1- i)))
illegal-escape
(illegal-escape ,getter src i)
accept
))))))
;;; This seems to be the only way to get Babel to use the new decoder.
(defparameter babel::*string-vector-mappings*
(babel-encodings:instantiate-concrete-mappings
:octet-seq-setter babel::ub-set
:octet-seq-getter babel::ub-get
:octet-seq-type (simple-array (unsigned-byte 8) (*))
:code-point-seq-setter babel::string-set
:code-point-seq-getter babel::string-get
:code-point-seq-type babel:simple-unicode-string))
;;; Testing.
(defun ub8v (&rest numbers)
(make-array (length numbers) :element-type '(unsigned-byte 8)
:initial-contents numbers))
(defun ub16v (&rest numbers)
(make-array (length numbers) :element-type '(unsigned-byte 16)
:initial-contents numbers))
(defun test-kjorbye-marc-8 ()
(ub8v 75 105 178 114 98 121 101 32 66 101 114 116 101 108 115 101 110 44
32 73 110 103 101 114))
(defun eacc (s)
(let* ((key (+ (* (+ (* (char-code (char s 0)) #x100)
(char-code (char s 1))) #x100)
(char-code (char s 2))))
(value (gethash key *eacc-decode-table*)))
(format t "~x~%" value)
(code-char value)))
(define-test decode
(assert-equal "Kiørbye Bertelsen, Inger"
(babel:octets-to-string (test-kjorbye-marc-8)
:encoding :marc-8))
(assert-equal "β-tubulin"
(babel:octets-to-string (babel:string-to-octets
"gbs-tubulin")
:encoding :marc-8))
(assert-equal "羅貫中 ; 張亦文譯."
(babel:octets-to-string (babel:string-to-octets "$1!R7!Yn!04(B ; $1!=G!0\\!BX!YF(B.")
:encoding :marc-8))
(assert-equal "羅貫中 ; 張亦文譯."
(babel:octets-to-string (babel:string-to-octets "$)1¡Ò·¡Ùî¡°´ ; ¡½Ç¡°Ü¡ÂØ¡ÙÆ)!E." :encoding :latin-1)
:encoding :marc-8)))
(define-test encode
(assert-equalp (test-kjorbye-marc-8)
(babel:string-to-octets "Kiørbye Bertelsen, Inger"
:encoding :marc-8))
(assert-equal "gbs-tubulin"
(babel:octets-to-string
(babel:string-to-octets "β-tubulin" :encoding :marc-8))))
;;; Unicode normalization.
(defun tt-to-number (line)
(multiple-value-bind (a b c d) (ppcre:scan "<tt>([^<]+)</tt>" line)
(declare (ignore a b))
(loop
with start = (aref c 0)
while (> (aref d 0) start)
for end = (or (position #\Space line :start start :end (aref d 0))
(aref d 0))
collect (parse-integer line :start start :end end :radix 16)
do (setf start (1+ end)))))
(defun read-normalization-table ()
(with-open-file (stream "/Users/ljosa/tmp/chart_Latin.html")
(let ((result nil))
(tagbody
(loop
for line = (read-line stream nil nil)
when (not line)
do (go done)
until (string-equal line "<tr>"))
(let ((a (tt-to-number (read-line stream)))
(b (tt-to-number (read-line stream)))
(c (tt-to-number (read-line stream)))
(d (tt-to-number (read-line stream)))
(e (tt-to-number (read-line stream))))
(declare (ignore a d e))
(push (list b c) result))
done)
(nreverse result))))