(in-package #:z3950) (babel-encodings::define-character-encoding :marc-8 "Encoding developed incrementally since 1968 for use with the MARC format for bibliographic records. Although UTF-8 can now be used with MARC, many systems (including Library of Congress) still use the MARC 8 encoding." ) ;;; Decoding tables. (defconstant undef #xfffd) (defparameter *basic-ascii-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents (loop for i from #x21 below #x7f collect i)) "http://lcweb2.loc.gov/diglib/codetables/42.html") (defparameter *greek-symbols-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(,@(loop for i from #x21 below #x61 collect undef) #x03b1 #x03b2 #x03b3 ,@(loop for i from #x64 below #x7f collect undef))) "http://lcweb2.loc.gov/diglib/codetables/2.html") (defparameter *subscripts-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(,@(loop for marc-8 from #x21 below #x28 collect undef) #x208d #x208e ,undef #x208a ,undef #x208b ,undef ,undef ,@(loop for marc-8 from #x30 below #x3a and ucs from #x2080 collect ucs) ,@(loop for marc-8 from #x3a below #x7f collect undef))) "http://lcweb2.loc.gov/diglib/codetables/3.html") (defparameter *superscripts-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(,@(loop for marc-8 from #x21 below #x28 collect undef) #x207d #x207e ,undef #x207a ,undef #x207b ,undef ,undef #x2070 #x00b9 #x00b2 #x00b3 ,@(loop for marc-8 from #x34 below #x3a and ucs from #x2074 collect ucs) ,@(loop for marc-8 from #x3a below #x7f collect undef))) "http://lcweb2.loc.gov/diglib/codetables/4.html") (defparameter *extended-latin-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(#x0141 #x00d8 #x0110 #x00de #x00c6 #x0152 #x02b9 ; a1--a7 #x00b7 #x266d #x00ae #x00b1 #x01a0 #x01af #x02bc ,undef ; af #x02bb #x0142 #x00f8 #x0111 #x00fe #x00e6 #x0153 #x02ba ; b7 #x0131 #x00a3 #x00f0 ,undef #x01a1 #x01b0 ,undef ,undef ; bf #x00b0 #x2113 #x2117 #x00a9 #x266f #x00bf #x00a1 #x00df ; c7 #x20ac ,@(loop for i from #xc9 below #xe0 collect undef) #x0309 #x0300 #x0301 #x0302 #x0303 #x0304 #x0306 #x0307 ; e7 #x0308 #x030c #x030a ; ea #xfe20 #xfe21 ; see note 1 #x0315 #x030b #x0310 ; ef #x0327 #x0328 #x0323 #x0324 #x0325 #x0333 #x0332 #x0326 ; f7 #x031c #x032e ; f9 #xfe22 #xfe23 ; see note 2 ,undef ,undef #x0313)) "http://lcweb2.loc.gov/diglib/codetables/45.html") (defparameter *basic-hebrew-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(#x21 #x05f4 #x23 #x24 #x25 #x26 #x05f3 #x28 #x29 #x2a #x2b #x2c #x05be #x2e #x2f ,@(loop for marc-8 from #x30 below #x40 collect marc-8) #x05b7 #x05b8 #x05b6 #x05b5 #x05b4 #x05b9 #x05bb #x05b0 #x05b2 #x05b3 #x05b1 #x05bc #x05bf #x05c1 #xfb1e ,@(loop for marc-8 from #x4f below #x5b collect undef) #x5b ,undef #x5d ,undef ,undef ,@(loop for marc-8 from #x60 below #x7b and ucs from #x05d0 collect ucs) #x05f0 #x05f1 #x05f2 ,undef)) "http://lcweb2.loc.gov/diglib/codetables/32.html") (defparameter *basic-cyrillic-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(,@(loop for marc-8 from #x21 below #x40 collect marc-8) #x044e #x0430 #x0431 #x0446 #x0434 #x0435 #x0444 #x0433 #x0445 #x0438 #x0439 #x043a #x043b #x043c #x043d #x043e #x043f #x044f #x0440 #x0441 #x0442 #x0443 #x0436 #x0432 #x044c #x044b #x0437 #x0448 #x044d #x0449 #x0447 #x044a #x042e #x0410 #x0411 #x0426 #x0414 #x0415 #x0424 #x0413 #x0425 #x0418 #x0419 #x041a #x041b #x041c #x041d #x041e #x041f #x042f #x0420 #x0421 #x0422 #x0423 #x0416 #x0412 #x042c #x042b #x0417 #x0428 #x042d #x0429 #x0427)) "http://lcweb2.loc.gov/diglib/codetables/4E.html") (defparameter *extended-cyrillic-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(,@(loop for i from #x21 below #x40 collect undef) #x0491 #x0452 #x0453 #x0454 #x0451 #x0455 #x0456 #x0457 #x0458 #x0459 #x045A #x045B #x045C #x045E #x045F ,undef #x0463 #x0473 #x0475 #x046B ,@(loop for i from #x54 below #x5b collect undef) #x005B ,undef #x005D ,undef #x005F #x0490 #x0402 #x0403 #x0404 #x0401 #x0405 #x0406 #x0407 #x0408 #x0409 #x040A #x040B #x040C #x040E #x040F #x042A #x0462 #x0472 #x0474 #x046A ,@(loop for i from #x74 below #x7f collect undef))) "http://lcweb2.loc.gov/diglib/codetables/51.html") (defparameter *basic-arabic-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(#x21 #x22 #x23 #x24 #x066a #x26 #x27 #x28 #x29 #x066d #x2b #x060c #x2d #x2e #x2f #x0660 #x0661 #x0662 #x0663 #x0664 #x0665 #x0666 #x0667 #x0668 #x0669 #x003a #x061b #x003c #x003d #x003e #x061f ,undef ,@(loop for m from #x41 below #x5b and ucs from #x0621 collect ucs) #x5b ,undef #xfd ,undef ,undef ,@(loop for m from #x60 below #x73 and ucs from #x0640 collect ucs) #x0671 #x0670 ,undef ,undef ,undef #x066c #x201d #x201c ,undef ,undef ,undef ,undef)) "http://lcweb2.loc.gov/diglib/codetables/33.html") (defparameter *extended-arabic-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `(#x06fd #x0672 #x0673 ,@(loop for m from #x24 below #x32 and ucs from #x0679 collect ucs) #x06bf ,@(loop for m from #x33 below #x49 and u from #x0687 collect u) #x06fa #x069d #x069e #x06fb #x069f #x06a0 #x06fc ,@(loop for m from #x50 below #x6c and u from #x06a1 collect u) #x06b9 #x06be #x06c0 #x06c4 #x06c5 #x06c6 #x06ca #x06cb #x06cd #x06ce #x06d0 #x06d2 #x06d3 ,undef ,undef ,undef ,undef #x0306 #x030c)) "http://lcweb2.loc.gov/diglib/codetables/34.html") (defparameter *basic-greek-decode-table* (make-array #x5e :element-type '(unsigned-byte 16) :initial-contents `( #x0300 #x0301 #x0308 #x0342 #x0313 #x0314 #x0345 ,undef ,undef ,undef ,undef ,undef ,undef ,undef ,undef #x00ab #x00bb #x201c #x201d #x0374 #x0375 ,undef ,undef ,undef ,undef ,undef #x0387 ,undef ,undef ,undef #x037e ,undef #x0391 #x0392 ,undef #x0393 #x0394 #x0395 #x03da #x03dc #x0396 #x0397 #x0398 #x0399 #x039a #x039b #x039c #x039d #x039e #x039f #x03a0 #x03de #x03a1 #x03a3 ,undef #x03a4 #x03a5 #x03a6 #x03a7 #x03a8 #x03a9 #x03e0 ,undef ,undef #x03b1 #x03b2 #x03d0 #x03b3 #x03b4 #x03b5 #x03db #x03dd #x03b6 #x03b7 #x03b8 #x03b9 #x03ba #x03bb #x03bc #x03bd #x03be #x03bf #x03c0 #x03df #x03c1 #x03c3 #x03c2 #x03c4 #x03c5 #x03c6 #x03c7 #x03c8 #x03c9 #x03e1)) "http://lcweb2.loc.gov/diglib/codetables/53.html") (defparameter *eacc-decode-table* (let ((ht (make-hash-table))) (with-open-file (input (asdf:system-relative-pathname '#:z3950 #p"eacc2uni.txt")) (let ((start 0) end m u) (loop for line = (read-line input nil nil) while line do (setq end (position #\, line) m (parse-integer line :end end :radix 16) start (1+ end) end (position #\, line :start start) u (parse-integer line :start start :end end :radix 16)) (setf (gethash m ht) u)))) ht) "http://www.loc.gov/marc/specifications/specchareacc.html") ;;; Decoder. (defmacro illegal-escape (getter src i) (alexandria:with-unique-names (j m) `(error "Illegal escape sequence:~{ ~X~}" (reverse (loop with ,j = ,i for ,m = (,getter ,src (decf ,j)) collect ,m until (= ,m #x1b)))))) (babel-encodings::define-code-point-counter :marc-8 (getter type) `(lambda (seq start end max) (declare (type ,type seq) (fixnum start end max)) (let ((nchars 0) (g0-width 1) (g1-width 1) (i start)) (macrolet ((input () `(prog1 (,',getter seq i) (incf i)))) (tagbody start (when (= i end) (go accept)) (let ((m (input))) (cond ((= m #x1b) (go esc)) ((or (= m #xa0) (= m #xff)) (go reserved)) ((< m #x21) (incf nchars) (go start)) ((< m #x7f) (cond ((= g0-width 1) (incf nchars) (go start)) (t (go triple-2)))) ((< m #xa1) (incf nchars) (go start)) (t (cond ((= g1-width 1) (incf nchars) (go start)) (t (go triple-2)))))) esc (when (= i end) (go illegal-escape)) (case (input) ((#x67 #x62 #x70 #x73) (go start)) ((#x28 #x2c #x29 #x2d) (go intermediate-single-1)) (#x24 (go intermediate-multiple-1)) (t (go illegal-escape))) intermediate-single-1 (when (= i end) (go illegal-escape)) (case (input) ((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (go start)) (#x21 (go intermediate-single-2)) (t (go illegal-escape))) intermediate-single-2 (when (= i end) (go illegal-escape)) (case (input) (#x45 (go start)) (t (go illegal-escape))) intermediate-multiple-1 (when (= i end) (go illegal-escape)) (case (input) (#x2c (go intermediate-multiple-2-g0)) (#x31 (setf g0-width 3) (go start)) ((#x29 #x2d) (go intermediate-multiple-2-g1)) (t (go illegal-escape))) intermediate-multiple-2-g0 (when (= i end) (go illegal-escape)) (case (input) (#x31 (setf g0-width 3) (go start)) (t (go illegal-escape))) intermediate-multiple-2-g1 (when (= i end) (go illegal-escape)) (case (input) (#x31 (setf g1-width 3) (go start)) (t (go illegal-escape))) triple-2 (when (= i end) (go partial)) (input) (go triple-3) triple-3 (when (= i end) (go partial)) (input) (incf nchars) (go start) partial (error "Partial multiple-byte character.") reserved (error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter seq (1- i))) illegal-escape (illegal-escape ,getter seq i) accept )) (values nchars end)))) (babel-encodings::define-decoder :marc-8 (getter src-type setter dest-type) `(lambda (src start end dest d-start) (declare (type ,src-type src) (type ,dest-type dest) (fixnum start end d-start)) (let ((g0 *basic-ascii-decode-table*) (g1 *extended-latin-decode-table*) m (g0-width 1) (g1-width 1) (i start) (di d-start)) (labels ((iso2022 (u) (ecase u (#x33 *basic-arabic-decode-table*) (#x34 *extended-arabic-decode-table*) (#x42 *basic-ascii-decode-table*) (#x21 (case (,getter src (incf i)) (#x45 *extended-latin-decode-table*) (t (illegal-escape ,getter src i)))) (#x31 *eacc-decode-table*) (#x4e *basic-cyrillic-decode-table*) (#x51 *extended-cyrillic-decode-table*) (#x53 *basic-greek-decode-table*) (#x32 *basic-hebrew-decode-table*)))) (macrolet ((input () `(prog1 (,',getter src i) (incf i)))) (tagbody start (when (= i end) (go accept)) (setq m (input)) (cond ((= m #x1b) (go esc)) ((or (= m #xa0) (= m #xff)) (go reserved)) ((< m #x21) (,setter m dest di) (incf di) (go start)) ((< m #x7f) (cond ((= g0-width 1) (,setter (aref g0 (- m #x21)) dest di) (incf di) (go start)) (t (go triple-2-g0)))) ((< m #xa1) (,setter m dest di) (incf di) (go start)) (t (cond ((= g1-width 1) (,setter (aref g1 (- m #xa1)) dest di) (incf di) (go start)) (t (go triple-2-g1))))) esc (when (= i end) (go illegal-escape)) (case (input) (#x67 (setf g0 *basic-greek-decode-table* g0-width 1) (go start)) (#x62 (setf g0 *subscripts-decode-table* g0-width 1) (go start)) (#x70 (setf g0 *superscripts-decode-table* g0-width 1) (go start)) (#x73 (setf g0 *basic-ascii-decode-table* g0-width 1) (go start)) ((#x28 #x2c) (go intermediate-single-1-g0)) ((#x29 #x2d) (go intermediate-single-1-g1)) (#x24 (go intermediate-multiple-1)) (t (go illegal-escape))) intermediate-single-1-g0 (when (= i end) (go illegal-escape)) (case (input) ((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (setf g0 (iso2022 (,getter src (1- i))) g0-width 1) (go start)) (#x21 (go intermediate-single-2-g0)) (t (go illegal-escape))) intermediate-single-1-g1 (when (= i end) (go illegal-escape)) (case (input) ((#x33 #x34 #x42 #x4e #x51 #x53 #x32) (setf g1 (iso2022 (,getter src (1- i))) g1-width 1) (go start)) (#x21 (go intermediate-single-2-g1)) (t (go illegal-escape))) intermediate-single-2-g0 (when (= i end) (go illegal-escape)) (case (input) (#x45 (setf g0 *extended-latin-decode-table* g0-width 1) (go start)) (t (go illegal-escape))) intermediate-single-2-g1 (when (= i end) (go illegal-escape)) (case (input) (#x45 (setf g1 *extended-latin-decode-table* g1-width 1) (go start)) (t (go illegal-escape))) intermediate-multiple-1 (when (= i end) (go illegal-escape)) (case (input) (#x2c (go intermediate-multiple-2-g0)) ((#x29 #x2d) (go intermediate-multiple-2-g1)) (#x31 (setf g0 *eacc-decode-table* g0-width 3) (go start)) (t (go illegal-escape))) intermediate-multiple-2-g0 (when (= i end) (go illegal-escape)) (case (input) (#x31 (setf g0 *eacc-decode-table* g0-width 3) (go start)) (t (go illegal-escape))) intermediate-multiple-2-g1 (when (= i end) (go illegal-escape)) (case (input) (#x31 (setf g1 *eacc-decode-table* g1-width 3) (go start)) (t (go illegal-escape))) triple-2-g0 (when (= i end) (go partial)) (input) (go triple-3-g0) triple-2-g1 (when (= i end) (go partial)) (input) (go triple-3-g1) triple-3-g0 (when (= i end) (go partial)) (input) (let* ((m1 (,getter src (- i 3))) (m2 (,getter src (- i 2))) (m3 (,getter src (- i 1))) (key (+ (* (+ (* m1 #x100) m2) #x100) m3))) (,setter (or (gethash key g0 nil) ,undef) dest di) (incf di)) (go start) triple-3-g1 (when (= i end) (go partial)) (input) (let* ((m1 (- (,getter src (- i 3)) #x80)) (m2 (- (,getter src (- i 2)) #x80)) (m3 (- (,getter src (- i 1)) #x80)) (key (+ (* (+ (* m1 #x100) m2) #x100) m3))) (,setter (or (gethash key g1 nil) ,undef) dest di) (incf di)) (go start) partial (error "Partial multiple-byte character.") reserved (error "~D is a reserved value and must not be used in MARC 8-encoded records." (,getter src (1- i))) illegal-escape (illegal-escape ,getter src i) accept )))))) ;;; This seems to be the only way to get Babel to use the new decoder. (defparameter babel::*string-vector-mappings* (babel-encodings:instantiate-concrete-mappings :octet-seq-setter babel::ub-set :octet-seq-getter babel::ub-get :octet-seq-type (simple-array (unsigned-byte 8) (*)) :code-point-seq-setter babel::string-set :code-point-seq-getter babel::string-get :code-point-seq-type babel:simple-unicode-string)) ;;; Testing. (defun ub8v (&rest numbers) (make-array (length numbers) :element-type '(unsigned-byte 8) :initial-contents numbers)) (defun ub16v (&rest numbers) (make-array (length numbers) :element-type '(unsigned-byte 16) :initial-contents numbers)) (defun test-kjorbye-marc-8 () (ub8v 75 105 178 114 98 121 101 32 66 101 114 116 101 108 115 101 110 44 32 73 110 103 101 114)) (defun eacc (s) (let* ((key (+ (* (+ (* (char-code (char s 0)) #x100) (char-code (char s 1))) #x100) (char-code (char s 2)))) (value (gethash key *eacc-decode-table*))) (format t "~x~%" value) (code-char value))) (define-test decode (assert-equal "Kiørbye Bertelsen, Inger" (babel:octets-to-string (test-kjorbye-marc-8) :encoding :marc-8)) (assert-equal "β-tubulin" (babel:octets-to-string (babel:string-to-octets "gbs-tubulin") :encoding :marc-8)) (assert-equal "羅貫中 ; 張亦文譯." (babel:octets-to-string (babel:string-to-octets "$1!R7!Yn!04(B ; $1!=G!0\\!BX!YF(B.") :encoding :marc-8)) (assert-equal "羅貫中 ; 張亦文譯." (babel:octets-to-string (babel:string-to-octets "$)1¡Ò·¡Ùî¡°´ ; ¡½Ç¡°Ü¡ÂØ¡ÙÆ)!E." :encoding :latin-1) :encoding :marc-8))) (define-test encode (assert-equalp (test-kjorbye-marc-8) (babel:string-to-octets "Kiørbye Bertelsen, Inger" :encoding :marc-8)) (assert-equal "gbs-tubulin" (babel:octets-to-string (babel:string-to-octets "β-tubulin" :encoding :marc-8)))) ;;; Unicode normalization. (defun tt-to-number (line) (multiple-value-bind (a b c d) (ppcre:scan "([^<]+)" line) (declare (ignore a b)) (loop with start = (aref c 0) while (> (aref d 0) start) for end = (or (position #\Space line :start start :end (aref d 0)) (aref d 0)) collect (parse-integer line :start start :end end :radix 16) do (setf start (1+ end))))) (defun read-normalization-table () (with-open-file (stream "/Users/ljosa/tmp/chart_Latin.html") (let ((result nil)) (tagbody (loop for line = (read-line stream nil nil) when (not line) do (go done) until (string-equal line "")) (let ((a (tt-to-number (read-line stream))) (b (tt-to-number (read-line stream))) (c (tt-to-number (read-line stream))) (d (tt-to-number (read-line stream))) (e (tt-to-number (read-line stream)))) (declare (ignore a d e)) (push (list b c) result)) done) (nreverse result))))