From c6a46f0c96dde73ef4f3a247a1e904d4cf15aeb2 Mon Sep 17 00:00:00 2001
From: Nicolas Williams <nico@twosigma.com>
Date: Sun, 11 Sep 2022 00:28:00 -0500
Subject: [PATCH] base: Implement JSON string escaping

We encode JSON in the KDC's audit logs, and soon in bx509d's /get-tgts.
Therefore, strings must be properly escaped when encoded and validated when parsed.
---
 lib/base/json.c             | 731 ++++++++++++++++++++++++++++++++----
 lib/base/string.c           |   3 +-
 lib/base/test_base.c        | 282 +++++++++++++-
 lib/base/version-script.map |   1 +
 4 files changed, 937 insertions(+), 80 deletions(-)

diff --git a/lib/base/json.c b/lib/base/json.c
index 2ef371b97..cf69e4b00 100644
--- a/lib/base/json.c
+++ b/lib/base/json.c
@@ -166,12 +166,175 @@ base2json(heim_object_t obj, struct twojson *j)
 	j->first = first;
 	break;
 
-    case HEIM_TID_STRING:
+    case HEIM_TID_STRING: {
+	const unsigned char *s = (const unsigned char *)heim_string_get_utf8(obj);
+	const unsigned char *p;
+        unsigned int c, cp, ctop, cbot;
+        char e[sizeof("\\u0123\\u3210")];
+        int good;
+        size_t i;
+
 	indent(j);
 	j->out(j->ctx, "\"");
-	j->out(j->ctx, heim_string_get_utf8(obj));
+        for (p = s; (c = *p); p++) {
+            switch (c) {
+            /* ASCII control characters w/ C-like escapes */
+            case '\b': j->out(j->ctx, "\\b");  continue;
+            case '\f': j->out(j->ctx, "\\f");  continue;
+            case '\n': j->out(j->ctx, "\\n");  continue;
+            case '\r': j->out(j->ctx, "\\r");  continue;
+            case '\t': j->out(j->ctx, "\\t");  continue;
+            /* Other must-escape non-control ASCII characters */
+            case '"':  j->out(j->ctx, "\\\""); continue;
+            case '\\': j->out(j->ctx, "\\\\"); continue;
+            default: break;
+            }
+
+            /*
+             * JSON string encoding is... complex.
+             *
+             * Invalid UTF-8 w/  HEIM_JSON_F_STRICT_STRINGS set -> return 1
+             *
+             * Invalid UTF-8 w/o HEIM_JSON_F_STRICT_STRINGS set -> pass
+             * through, a sort of Heimdal WTF-8, but not _the_ WTF-8.
+             */
+            if (c < 0x20) {
+                /* ASCII control character w/o C-like escape */
+                e[0] = '\\';
+                e[1] = 'u';
+                e[2] = '0';
+                e[3] = '0';
+                e[4] = "0123456789ABCDEF"[c>>4];
+                e[5] = "0123456789ABCDEF"[c & 0x0f];
+                e[6] = '\0';
+                j->out(j->ctx, e);
+                continue;
+            }
+            if (c < 0x80) {
+                /* ASCII */
+                e[0] = c;
+                e[1] = '\0';
+                j->out(j->ctx, e);
+                continue;
+            }
+            if ((c & 0xc0) == 0x80) {
+                /* UTF-8 bare non-leading byte */
+                if (!(j->flags & HEIM_JSON_F_STRICT_STRINGS)) {
+                    e[0] = c;
+                    e[1] = '\0';
+                    j->out(j->ctx, e);
+                    continue;
+                }
+                return 1;
+            }
+            if ((c & 0xe0) == 0xc0) {
+                /* UTF-8 leading byte of two-byte sequence */
+                good = 1;
+                for (i = 1; i < 2 && good && p[i]; i++) {
+                    if ((p[i] & 0xc0) != 0x80)
+                        good = 0;
+                }
+                if (i != 2)
+                    good = 0;
+                if (!good && !(j->flags & HEIM_JSON_F_STRICT_STRINGS)) {
+                    e[0] = c;
+                    e[1] = '\0';
+                    j->out(j->ctx, e);
+                    continue;
+                } else if (!good) {
+                    return 1;
+                }
+                e[0] = c;
+                e[1] = p[1];
+                e[2] = '\0';
+                j->out(j->ctx, e);
+                p += 1;
+                continue;
+            }
+            if ((c & 0xf0) == 0xe0) {
+                /* UTF-8 leading byte of three-byte sequence */
+                good = 1;
+                for (i = 1; i < 3 && good && p[i]; i++) {
+                    if ((p[i] & 0xc0) != 0x80)
+                        good = 0;
+                }
+                if (i != 3)
+                    good = 0;
+                if (!good && !(j->flags & HEIM_JSON_F_STRICT_STRINGS)) {
+                    e[0] = c;
+                    e[1] = '\0';
+                    j->out(j->ctx, e);
+                    continue;
+                } else if (!good) {
+                    return 1;
+                }
+                e[0] = c;
+                e[1] = p[1];
+                e[2] = p[2];
+                e[3] = '\0';
+                j->out(j->ctx, e);
+                p += 2;
+                continue;
+            }
+
+            if (c > 0xf7) {
+                /* Invalid UTF-8 leading byte */
+                if (!(j->flags & HEIM_JSON_F_STRICT_STRINGS)) {
+                    e[0] = c;
+                    e[1] = '\0';
+                    j->out(j->ctx, e);
+                    continue;
+                }
+                return 1;
+            }
+
+            /*
+             * A codepoint > U+FFFF, needs encoding a la UTF-16 surrogate
+             * pair because JSON takes after JS which uses UTF-16.  Ugly.
+             */
+            cp = c & 0x7;
+            good = 1;
+            for (i = 1; i < 4 && good && p[i]; i++) {
+                if ((p[i] & 0xc0) == 0x80)
+                    cp = (cp << 6) | (p[i] & 0x3f);
+                else
+                    good = 0;
+            }
+            if (i != 4)
+                good = 0;
+            if (!good && !(j->flags & HEIM_JSON_F_STRICT_STRINGS)) {
+                e[0] = c;
+                e[1] = '\0';
+                j->out(j->ctx, e);
+                continue;
+            } else if (!good) {
+                return 1;
+            }
+            p += 3;
+
+            cp -= 0x10000;
+            ctop = 0xD800 + (cp >>   10);
+            cbot = 0xDC00 + (cp & 0x3ff);
+
+            e[0 ] = '\\';
+            e[1 ] = 'u';
+            e[2 ] = "0123456789ABCDEF"[(ctop         ) >> 12];
+            e[3 ] = "0123456789ABCDEF"[(ctop & 0x0f00) >>  8];
+            e[4 ] = "0123456789ABCDEF"[(ctop & 0x00f0) >>  4];
+            e[5 ] = "0123456789ABCDEF"[(ctop & 0x000f)      ];
+            e[6 ] = '\\';
+            e[7 ] = 'u';
+            e[8 ] = "0123456789ABCDEF"[(cbot         ) >> 12];
+            e[9 ] = "0123456789ABCDEF"[(cbot & 0x0f00) >>  8];
+            e[10] = "0123456789ABCDEF"[(cbot & 0x00f0) >>  4];
+            e[11] = "0123456789ABCDEF"[(cbot & 0x000f)      ];
+            e[12] = '\0';
+            j->out(j->ctx, e);
+            continue;
+        }
 	j->out(j->ctx, "\"");
 	break;
+    }
 
     case HEIM_TID_DATA: {
 	heim_dict_t d;
@@ -255,9 +418,6 @@ heim_base2json(heim_object_t obj, void *ctx, heim_json_flags_t flags,
 {
     struct twojson j;
 
-    if (flags & HEIM_JSON_F_STRICT_STRINGS)
-	return ENOTSUP; /* Sorry, not yet! */
-
     heim_base_once_f(&heim_json_once, NULL, json_init_once);
 
     j.indent = 0;
@@ -342,93 +502,428 @@ parse_number(struct parse_ctx *ctx)
     return heim_number_create(number * neg);
 }
 
+/*
+ * Read exactly 4 hex digits from ctx->p; return their value (0..0xFFFF).
+ * On a non-hex byte or premature end of input, rewind ctx->p and return -1.
+ */
+static int
+unescape_unicode(struct parse_ctx *ctx)
+{
+    int c = 0;
+    int i;
+
+    for (i = 0; i < 4 && ctx->p < ctx->pend; i++, ctx->p++) {
+        if (*ctx->p >= '0' && *ctx->p <= '9')
+            c = (c << 4) + (*ctx->p - '0');
+        else if (*ctx->p >= 'A' && *ctx->p <= 'F')
+            c = (c << 4) + (10 + *ctx->p - 'A');
+        else if (*ctx->p >= 'a' && *ctx->p <= 'f')
+            c = (c << 4) + (10 + *ctx->p - 'a');
+        else
+            break;
+    }
+    if (i == 4)
+        return c;
+    ctx->p -= i; /* short or malformed: undo the partial consumption */
+    return -1;
+}
+
+/*
+ * Append the UTF-8 encoding of code point `c' at `*pp' without writing at
+ * or beyond `pend'.
+ *
+ * Returns 1 and advances `*pp' past the encoded bytes on success.  Returns
+ * 0 and leaves `*pp' untouched if `c' is 0x110000 or more, or if the buffer
+ * is too short (some bytes may already have been stored in that case).
+ * `ctx' is not used.
+ */
+static int
+encode_utf8(struct parse_ctx *ctx, char **pp, char *pend, int c)
+{
+    char *out = *pp;
+    int lead;   /* first code unit */
+    int shift;  /* bit offset of the payload carried by the previous unit */
+
+    if (c < 0x80) {
+        lead = c;                   /* ASCII */
+        shift = 0;
+    } else if (c < 0x800) {
+        lead = 0xc0 | (c >> 6);     /* 2 code unit UTF-8 sequence */
+        shift = 6;
+    } else if (c < 0x10000) {
+        lead = 0xe0 | (c >> 12);    /* 3 code unit UTF-8 sequence */
+        shift = 12;
+    } else if (c < 0x110000) {
+        lead = 0xf0 | (c >> 18);    /* 4 code unit UTF-8 sequence */
+        shift = 18;
+    } else {
+        return 0;
+    }
+
+    if (out >= pend)
+        return 0;
+    *(out++) = lead;
+
+    /* Each continuation unit carries the next six payload bits */
+    while (shift > 0) {
+        shift -= 6;
+        if (out == pend)
+            return 0;
+        *(out++) = 0x80 | ((c >> shift) & 0x3f);
+    }
+
+    *pp = out;
+    return 1;
+}
+
+/* Free `freeme' and set ctx->error to an EINVAL error built from `msg' and
+ * the current line number.  Always returns NULL. */
+static heim_string_t
+parse_string_error(struct parse_ctx *ctx, char *freeme, const char *msg)
+{
+    free(freeme);
+    ctx->error = heim_error_create(EINVAL, "%s at line %lu", msg, ctx->lineno);
+    return NULL;
+}
+
 static heim_string_t
 parse_string(struct parse_ctx *ctx)
 {
     const uint8_t *start;
-    int quote = 0;
+    heim_object_t o;
+    size_t alloc_len = 0;
+    size_t need = 0;
+    char *p0 = NULL, *p = NULL, *pend = NULL;
+    int strict = ctx->flags & HEIM_JSON_F_STRICT_STRINGS;
+    int binary = 0;
 
-    if (ctx->flags & HEIM_JSON_F_STRICT_STRINGS) {
-	ctx->error = heim_error_create(EINVAL, "Strict JSON string encoding "
-				       "not yet supported");
-	return NULL;
+    /*
+     * Returns a string object; or a data object if the string has NULs or
+     * decodes as base64 (HEIM_JSON_F_TRY_DECODE_DATA); or NULL on error.
+     */
+    if (*ctx->p != '"')
+        return parse_string_error(ctx, NULL,
+                                  "Expected a JSON string but found "
+                                  "something else");
+    start = ++(ctx->p);
+
+    /* Estimate how many bytes we need to allocate */
+    for (need = 1; ctx->p < ctx->pend; ctx->p++) {
+        need++;
+        if (*ctx->p == '\\' && ctx->p + 1 < ctx->pend)
+            ctx->p++;
+        else if (*ctx->p == '"')
+            break;
     }
+    if (ctx->p == ctx->pend)
+        return parse_string_error(ctx, NULL, "Unterminated JSON string");
 
-    if (*ctx->p != '"') {
-	ctx->error = heim_error_create(EINVAL, "Expected a JSON string but "
-				       "found something else at line %lu",
-				       ctx->lineno);
-	return NULL;
-    }
-    start = ++ctx->p;
-
+    ctx->p = start;
     while (ctx->p < ctx->pend) {
-	if (*ctx->p == '\n') {
-	    ctx->lineno++;
-	} else if (*ctx->p == '\\') {
-	    if (ctx->p + 1 == ctx->pend)
-		goto out;
-	    ctx->p++;
-	    quote = 1;
-	} else if (*ctx->p == '"') {
-	    heim_object_t o;
+        const unsigned char *p_save;
+        int32_t ctop, cbot = -1;
 
-	    if (quote) {
-		char *p0, *p;
-		p = p0 = malloc(ctx->p - start);
-		if (p == NULL)
-		    goto out;
-		while (start < ctx->p) {
-		    if (*start == '\\') {
-			start++;
-			/* XXX validate quoted char */
-		    }
-		    *p++ = *start++;
-		}
-		o = heim_string_create_with_bytes(p0, p - p0);
-		free(p0);
-	    } else {
-		o = heim_string_create_with_bytes(start, ctx->p - start);
-		if (o == NULL) {
-		    ctx->error = heim_error_create_enomem();
-		    return NULL;
-		}
+        if (*ctx->p == '"') {
+            ctx->p++;
+            break;
+        }
 
-		/* If we can decode as base64, then let's */
-		if (ctx->flags & HEIM_JSON_F_TRY_DECODE_DATA) {
-		    void *buf;
-		    size_t len;
-		    const char *s;
+        /* Allocate or resize our output buffer if need be */
+        if (need || p == pend) {
+            size_t used = p0 ? (size_t)(p - p0) : 0; /* taken before p0 moves */
+            char *tmp = realloc(p0, alloc_len + need + 5 /* slop? */);
 
-		    s = heim_string_get_utf8(o);
-		    len = strlen(s);
+            if (tmp == NULL) {
+                ctx->error = heim_error_create_enomem();
+                free(p0);
+                return NULL;
+            }
+            alloc_len += need + 5;
+            p0 = tmp;
+            p = p0 + used;
+            pend = p0 + alloc_len;
 
-		    if (len >= 4 && strspn(s, base64_chars) >= len - 2) {
-			buf = malloc(len);
-			if (buf == NULL) {
-			    heim_release(o);
-			    ctx->error = heim_error_create_enomem();
-			    return NULL;
-			}
-			len = rk_base64_decode(s, buf);
-			if (len == -1) {
-			    free(buf);
-			    return o;
-			}
-			heim_release(o);
-			o = heim_data_ref_create(buf, len, free);
-		    }
-		}
-	    }
-	    ctx->p += 1;
+            need = 0;
+        }
 
-	    return o;
-	}
-	ctx->p += 1;
+        if (*ctx->p != '\\') {
+            unsigned char c = *ctx->p;
+
+            /*
+             * Not backslashed -> consume now.
+             *
+             * NOTE: All cases in this block must continue or return w/ error.
+             */
+
+            /* Check for unescaped ASCII control characters */
+            if (c == '\n') {
+                if (strict)
+                    return parse_string_error(ctx, p0,
+                                              "Unescaped newline in JSON string");
+                /* Count the newline; it is still copied to the output below */
+                ctx->lineno++;
+            } else if (strict && *ctx->p <= 0x1f) {
+                return parse_string_error(ctx, p0, "Unescaped ASCII control character");
+            } else if (c == 0) {
+                binary = 1;
+            }
+            if (!strict || c < 0x80) {
+                /* ASCII, or not strict -> no need to validate */
+                *(p++) = c;
+                ctx->p++;
+                continue;
+            }
+
+            /*
+             * Being strict for parsing means we want to detect malformed UTF-8
+             * sequences.
+             *
+             * If not strict then we just go on below and add to `p' whatever
+             * bytes we find in `ctx->p' as we find them.
+             *
+             * For each two-byte sequence we need one more byte in `p[]'.  For
+             * each three-byte sequence we need two more bytes in `p[]'.
+             *
+             * Setting `need' and looping will cause `p0' to be grown.
+             *
+             * NOTE: All cases in this block must continue or return w/ error.
+             */
+            if ((c & 0xe0) == 0xc0) {
+                /* Two-byte UTF-8 encoding */
+                if (pend - p < 2) {
+                    need = 2;
+                    continue; /* realloc p0 */
+                }
+
+                *(p++) = c;
+                ctx->p++;
+                if (ctx->p == ctx->pend)
+                    return parse_string_error(ctx, p0, "Truncated UTF-8");
+                c = *(ctx->p++);
+                if ((c & 0xc0) != 0x80)
+                    return parse_string_error(ctx, p0, "Truncated UTF-8");
+                *(p++) = c;
+                continue;
+            }
+            if ((c & 0xf0) == 0xe0) {
+                /* Three-byte UTF-8 encoding */
+                if (pend - p < 3) {
+                    need = 3;
+                    continue; /* realloc p0 */
+                }
+
+                *(p++) = c;
+                ctx->p++;
+                if (ctx->p == ctx->pend)
+                    return parse_string_error(ctx, p0, "Truncated UTF-8");
+                c = *(ctx->p++);
+                if ((c & 0xc0) != 0x80)
+                    return parse_string_error(ctx, p0, "Truncated UTF-8");
+                *(p++) = c;
+                c = *(ctx->p++);
+                if ((c & 0xc0) != 0x80)
+                    return parse_string_error(ctx, p0, "Truncated UTF-8");
+                *(p++) = c;
+                continue;
+            }
+            if ((c & 0xf8) == 0xf0)
+                return parse_string_error(ctx, p0, "UTF-8 sequence not "
+                                          "encoded as escaped UTF-16");
+            if ((c & 0xc0) == 0x80)
+                return parse_string_error(ctx, p0,
+                                          "Invalid UTF-8 "
+                                          "(bare continuation code unit)");
+
+            return parse_string_error(ctx, p0, "Not UTF-8");
+        }
+
+        /* Backslash-quoted character */
+        ctx->p++;
+        if (ctx->p == ctx->pend) {
+            ctx->error =
+                heim_error_create(EINVAL,
+                                  "Unterminated JSON string at line %lu",
+                                  ctx->lineno);
+            free(p0);
+            return NULL;
+        }
+        switch (*ctx->p) {
+        /* Simple escapes */
+        case  'b': *(p++) = '\b'; ctx->p++; continue;
+        case  'f': *(p++) = '\f'; ctx->p++; continue;
+        case  'n': *(p++) = '\n'; ctx->p++; continue;
+        case  'r': *(p++) = '\r'; ctx->p++; continue;
+        case  't': *(p++) = '\t'; ctx->p++; continue;
+        case  '"': *(p++) = '"';  ctx->p++; continue;
+        case '\\': *(p++) = '\\'; ctx->p++; continue;
+        case  '/': *(p++) = '/';  ctx->p++; continue; /* valid JSON escape */
+        case  'u': /* Escaped Unicode, handled below */
+            /*
+             * Worst case for !strict we need 11 bytes for a truncated non-BMP
+             * codepoint escape.  Call it 12.
+             */
+            if (strict)
+                need = 4;
+            else
+                need = 12;
+            if (pend - p < need) {
+                /* Go back to the backslash, realloc, try again */
+                ctx->p--;
+                continue;
+            }
+
+            need = 0;
+            ctx->p++;
+            break;
+        default:
+            if (!strict) {
+                *(p++) = *ctx->p;
+                ctx->p++;
+                continue;
+            }
+            ctx->error =
+                heim_error_create(EINVAL,
+                                  "Invalid backslash escape at line %lu",
+                                  ctx->lineno);
+            free(p0);
+            return NULL;
+        }
+
+        /* Unicode code point */
+        if (pend - p < 12) {
+            need = 12;
+            ctx->p -= 2; /* for "\\u" */
+            continue; /* This will cause p0 to be realloc'ed */
+        }
+        p_save = ctx->p;
+        ctop = unescape_unicode(ctx);
+        if (ctop == -1 && strict)
+            return parse_string_error(ctx, p0, "Invalid escaped Unicode");
+        if (ctop == -1) {
+            /*
+             * Not strict; tolerate bad input.
+             *
+             * Output "\\u" and then loop to treat what we expected to be four
+             * digits as if they were not part of an escaped Unicode codepoint.
+             */
+            ctx->p = p_save;
+            if (p < pend)
+                *(p++) = '\\';
+            if (p < pend)
+                *(p++) = 'u';
+            continue;
+        }
+        if (ctop == 0) {
+            *(p++) = '\0';
+            binary = 1;
+            continue;
+        }
+        if (ctop < 0xd800 || ctop > 0xdfff) {
+            if (!encode_utf8(ctx, &p, pend, ctop))
+                return parse_string_error(ctx, p0,
+                                          "Internal JSON string parse error");
+            continue;
+        }
+
+        /*
+         * We parsed the top escaped codepoint of a surrogate pair encoding
+         * of a non-BMP Unicode codepoint.  What follows must be another
+         * escaped codepoint.
+         */
+        if (ctx->p < ctx->pend && ctx->p[0] == '\\')
+            ctx->p++;
+        else
+            ctop = -1;
+        if (ctop > -1 && ctx->p < ctx->pend && ctx->p[0] == 'u')
+            ctx->p++;
+        else
+            ctop = -1;
+        if (ctop > -1) {
+            /* Parse the hex digits of the bottom half of the surrogate pair */
+            cbot = unescape_unicode(ctx);
+            if (ctop > 0xdbff || cbot < 0xdc00 || cbot > 0xdfff)
+                ctop = -1;
+        }
+        if (ctop == -1) {
+            if (strict)
+                return parse_string_error(ctx, p0,
+                                          "Invalid surrogate pair");
+
+            /*
+             * Output "\\u", rewind, output the digits of `ctop'.
+             *
+             * When we get to what should have been the bottom half of the
+             * pair we'll necessarily fail to parse it as a normal escaped
+             * Unicode codepoint, and once again, rewind and output its digits.
+             */
+            if (p < pend)
+                *(p++) = '\\';
+            if (p < pend)
+                *(p++) = 'u';
+            ctx->p = p_save;
+            continue;
+        }
+
+        /* Finally decode the surrogate pair then encode as UTF-8 */
+        ctop -= 0xd800;
+        cbot -= 0xdc00;
+        if (!encode_utf8(ctx, &p, pend, 0x10000 + ((ctop << 10) | (cbot & 0x3ff))))
+            return parse_string_error(ctx, p0,
+                                      "Internal JSON string parse error");
     }
-    out:
-    ctx->error = heim_error_create(EINVAL, "ran out of string");
-    return NULL;
+
+    if (p0 == NULL)
+        return heim_string_create("");
+
+    /* NUL-terminate for rk_base64_decode() and plain paranoia */
+    if (p == pend) {
+        size_t used = p - p0; /* buffer is exactly full: grow it by one */
+        char *tmp = realloc(p0, used + 1);
+        if (tmp == NULL) {
+            ctx->error = heim_error_create_enomem();
+            free(p0);
+            return NULL;
+        }
+        p0 = tmp;
+        p = p0 + used;
+        pend = p + 1;
+    }
+    *(p++) = '\0';
+
+    /* If there's embedded NULs, it's not a C string */
+    if (binary) {
+        o = heim_data_ref_create(p0, (p - 1) - p0, free);
+        return o;
+    }
+
+    /* If we can decode as base64, then let's */
+    if (ctx->flags & HEIM_JSON_F_TRY_DECODE_DATA) {
+        void *buf;
+        size_t len = (p - 1) - p0; /* length w/o the terminating NUL */
+
+        if (len >= 4 && strspn(p0, base64_chars) >= len - 2) {
+            buf = malloc(len);
+            if (buf == NULL) {
+                ctx->error = heim_error_create_enomem();
+                free(p0);
+                return NULL;
+            }
+            len = rk_base64_decode(p0, buf);
+            if (len != (size_t)-1) {
+                /* Yes base64, return the decoded data */
+                o = heim_data_ref_create(buf, len, free);
+                free(p0);
+                return o;
+            }
+            /* Not base64, so return what we had */
+            free(buf);
+        }
+    }
+
+    /* Sadly this will copy `p0' */
+    o = heim_string_create_with_bytes(p0, p - p0);
+    free(p0);
+    return o;
 }
 
 static int
@@ -809,3 +1304,83 @@ heim_json_copy_serialize(heim_object_t obj, heim_json_flags_t flags, heim_error_
     }
     return str;
 }
+
+struct heim_eq_f_ctx {
+    heim_dict_t other; /* dictionary each visited key is looked up in */
+    int ret;           /* starts at 1; result of the last comparison made */
+};
+
+/*
+ * Dict iteration callback: record in ctx->ret whether `val' equals the value
+ * stored under `key' in ctx->other.  A recorded mismatch is never overwritten.
+ */
+static void
+heim_eq_dict_iter_f(heim_object_t key, heim_object_t val, void *d)
+{
+    struct heim_eq_f_ctx *state = d;
+
+    /*
+     * The lookup doesn't work if the key is an array or a dict, which,
+     * anyways, isn't allowed in JSON, though we allow it.
+     */
+    if (state->ret)
+        state->ret = heim_json_eq(val, heim_dict_get_value(state->other, key));
+}
+
+/* Deep equality: 1 if same object or same type w/ equal contents, else 0 */
+int
+heim_json_eq(heim_object_t a, heim_object_t b)
+{
+    heim_tid_t tid;
+
+    if (a == b)
+        return 1;
+    if (a == NULL || b == NULL)
+        return 0;
+    tid = heim_get_tid(a);
+    if (tid != heim_get_tid(b))
+        return 0;
+    switch (tid) {
+    case HEIM_TID_ARRAY: {
+        size_t idx = 0, n = heim_array_get_length(b);
+
+        if (heim_array_get_length(a) != n)
+            return 0;
+        while (idx < n) {
+            if (!heim_json_eq(heim_array_get_value(a, idx),
+                              heim_array_get_value(b, idx)))
+                return 0;
+            idx++;
+        }
+        return 1;
+    }
+    case HEIM_TID_DICT: {
+        struct heim_eq_f_ctx state;
+        /* Every entry of `a' must have an equal entry in `b' ... */
+        state.other = b;
+        state.ret = 1;
+        heim_dict_iterate_f(a, &state, heim_eq_dict_iter_f);
+        if (!state.ret)
+            return 0;
+        /* ... and the other way around */
+        state.other = a;
+        heim_dict_iterate_f(b, &state, heim_eq_dict_iter_f);
+        return state.ret;
+    }
+    case HEIM_TID_STRING:
+        return !strcmp(heim_string_get_utf8(a), heim_string_get_utf8(b));
+    case HEIM_TID_DATA:
+        if (heim_data_get_length(a) != heim_data_get_length(b))
+            return 0;
+        return !memcmp(heim_data_get_ptr(a), heim_data_get_ptr(b),
+                       heim_data_get_length(a));
+    case HEIM_TID_NUMBER:
+        return heim_number_get_long(a) == heim_number_get_long(b);
+    case HEIM_TID_NULL:
+    case HEIM_TID_BOOL:
+        return heim_bool_val(a) == heim_bool_val(b);
+    default:
+        break;
+    }
+    return 0;
+}
diff --git a/lib/base/string.c b/lib/base/string.c
index f94244716..7682fb39d 100644
--- a/lib/base/string.c
+++ b/lib/base/string.c
@@ -153,7 +153,8 @@ heim_string_create_with_bytes(const void *data, size_t len)
 
     s = _heim_alloc_object(&_heim_string_object, len + 1);
     if (s) {
-	memcpy(s, data, len);
+        if (len)
+            memcpy(s, data, len);
 	((char *)s)[len] = '\0';
     }
     return s;
diff --git a/lib/base/test_base.c b/lib/base/test_base.c
index be6c860e2..c2a27c9b2 100644
--- a/lib/base/test_base.c
+++ b/lib/base/test_base.c
@@ -244,26 +244,266 @@ test_json(void)
     };
     char *s;
     size_t i, k;
-    heim_object_t o, o2;
+    heim_object_t o, o2, o3;
     heim_string_t k1 = heim_string_create("k1");
 
     o = heim_json_create("\"string\"", 10, 0, NULL);
     heim_assert(o != NULL, "string");
     heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
     heim_assert(strcmp("string", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
+    /*
+     * Test string escaping:
+     *
+     *  - C-like must-escapes
+     *  - ASCII control character must-escapes
+     *  - surrogate pairs
+     *
+     * We test round-tripping.  First we parse, then we serialize, then parse,
+     * then compare the second parse to the first for equality.
+     *
+     * We do compare serialized forms in spite of their not being canonical.
+     * That means that some changes to serialization can cause failures here.
+     */
+    o = heim_json_create("\""
+        "\\b\\f\\n\\r\\t"   /* ASCII C-like escapes */
+        "\x1e"              /* ASCII control character w/o C-like escape */
+        "\\u00e1"           /* &aacute; */
+        "\\u07ff"
+        "\\u0801"
+        "\\u8001"
+        "\\uD834\\udd1e"    /* U+1D11E, as shown in RFC 7159 */
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "string");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp(
+        "\b\f\n\r\t"
+        "\x1e"
+        "\xc3\xa1"
+        "\xdf\xbf"
+        "\xe0\xA0\x81"
+        "\xe8\x80\x81"
+        "\xf0\x9d\x84\x9e", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001Eá߿ࠁ老\\uD834\\uDD1E\"",
+                       heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is till valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    o = heim_json_create("\""
+        "\\b\\f\\n\\r\\t"   /* ASCII C-like escapes */
+        "\x1e"              /* ASCII control character w/o C-like escape */
+        "\xc3\xa1"
+        "\xdf\xbf"
+        "\xe0\xa0\x81"
+        "\xE8\x80\x81"
+        "\\uD834\\udd1e"    /* U+1D11E, as shown in RFC 7159 */
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "string");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp(
+        "\b\f\n\r\t"
+        "\x1e"
+        "\xc3\xa1"
+        "\xdf\xbf"
+        "\xe0\xA0\x81"
+        "\xe8\x80\x81"
+        "\xf0\x9d\x84\x9e", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001Eá߿ࠁ老\\uD834\\uDD1E\"",
+                       heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is till valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test rejection of unescaped ASCII control characters */
+    o = heim_json_create("\"\b\\f\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "strict parse accepted bad input");
+    o = heim_json_create("\"\b\x1e\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "strict parse accepted bad input");
+
+    o = heim_json_create("\"\b\\f\"", 10, 0, NULL);
+    heim_assert(o != NULL, "string");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp("\b\f", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\"\\b\\f\"", heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is till valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test bogus backslash escape */
+    o = heim_json_create("\""
+        "\\ "
+        "\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "malformed string accepted");
+    o = heim_json_create("\""
+        "\\ "
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "malformed string rejected (not strict)");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp(" ", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\" \"", heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is till valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test truncated surrogate encoding (bottom code unit) */
+    o = heim_json_create("\""
+        "\xE8\x80\x81"
+        "\\uD834\\udd"
+        "\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "malformed string accepted");
+    o = heim_json_create("\""
+        "\xE8\x80\x81"
+        "\\uD834\\udd"
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "malformed string rejected (not strict)");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp(
+        "\xe8\x80\x81"
+        "\\uD834\\udd", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\"老\\\\uD834\\\\udd\"",
+                       heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is till valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test truncated surrogate encodings (top code unit) */
+    o = heim_json_create("\""
+        "\xE8\x80\x81"
+        "\\uD83"
+        "\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "malformed string accepted");
+    o = heim_json_create("\""
+        "\xE8\x80\x81"
+        "\\uD83"
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "malformed string rejected (not strict)");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp(
+        "\xe8\x80\x81"
+        "\\uD83", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(strcmp("\"老\\\\uD83\"",
+                       heim_string_get_utf8(o2)) == 0,
+                "JSON encoding changed; please check that it is still valid");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /*
+     * Test handling of truncated UTF-8 multi-byte sequences.
+     */
+    o = heim_json_create("\""
+        "\xE8\x80"
+        "\"", 10, 0, NULL);
+    heim_assert(o != NULL, "malformed string rejected (not strict)");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp("\xe8\x80",
+                       heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o2 == NULL, "malformed string serialized");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o3 == NULL, "malformed string accepted (strict)");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(strcmp("\xe8\x80",
+                       heim_string_get_utf8(o3)) == 0, "wrong string");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test handling of unescaped / embedded newline */
+    o = heim_json_create("\"\n\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o == NULL, "malformed string accepted (strict)");
+    o = heim_json_create("\"\n\"", 10, 0, NULL);
+    heim_assert(o != NULL, "malformed string rejected (not strict)");
+    heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
+    heim_assert(strcmp("\n", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o2 != NULL, "string not serialized");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o3 != NULL, "string not accepted");
+    heim_assert(strcmp("\n", heim_string_get_utf8(o3)) == 0, "wrong string");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /* Test handling of embedded NULs (must decode as data, not string) */
+    o = heim_json_create("\"\\u0000\"", 10, HEIM_JSON_F_STRICT, NULL);
+    heim_assert(o != NULL, "string with NULs rejected");
+    heim_assert(heim_get_tid(o) == heim_data_get_type_id(), "data-tid");
+    heim_assert(heim_data_get_length(o) == 1, "wrong data length");
+    heim_assert(((const char *)heim_data_get_ptr(o))[0] == '\0',
+                "wrong data NUL");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    heim_assert(o2 != NULL, "data not serialized");
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10,
+                          HEIM_JSON_F_TRY_DECODE_DATA, NULL);
+    heim_assert(o3 != NULL, "data not accepted");
+    heim_assert(heim_data_get_length(o3) == 1, "wrong data length");
+    heim_assert(((const char *)heim_data_get_ptr(o3))[0] == '\0',
+                "wrong data NUL");
+    heim_release(o3);
+    heim_release(o2);
+    heim_release(o);
+
+    /*
+     * Note that the trailing ']' is not part of the JSON text (which is just a
+     * string).
+     */
     o = heim_json_create(" \"foo\\\"bar\" ]", 10, 0, NULL);
     heim_assert(o != NULL, "string");
     heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
     heim_assert(strcmp("foo\"bar", heim_string_get_utf8(o)) == 0, "wrong string");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create(" { \"key\" : \"value\" }", 10, 0, NULL);
     heim_assert(o != NULL, "dict");
     heim_assert(heim_get_tid(o) == heim_dict_get_type_id(), "dict-tid");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
+    /*
+     * heim_json_eq() can't handle dicts with dicts as keys, so we don't check
+     * for round-tripping here
+     */
     o = heim_json_create("{ { \"k1\" : \"s1\", \"k2\" : \"s2\" } : \"s3\", "
 			 "{ \"k3\" : \"s4\" } : -1 }", 10, 0, NULL);
     heim_assert(o != NULL, "dict");
@@ -281,6 +521,11 @@ test_json(void)
     o2 = heim_dict_copy_value(o, k1);
     heim_assert(heim_get_tid(o2) == heim_string_get_type_id(), "string-tid");
     heim_release(o2);
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create(" { \"k1\" : { \"k2\" : \"s2\" } }", 10, 0, NULL);
@@ -289,6 +534,11 @@ test_json(void)
     o2 = heim_dict_copy_value(o, k1);
     heim_assert(heim_get_tid(o2) == heim_dict_get_type_id(), "dict-tid");
     heim_release(o2);
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create("{ \"k1\" : 1 }", 10, 0, NULL);
@@ -297,26 +547,51 @@ test_json(void)
     o2 = heim_dict_copy_value(o, k1);
     heim_assert(heim_get_tid(o2) == heim_number_get_type_id(), "number-tid");
     heim_release(o2);
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create("-10", 10, 0, NULL);
     heim_assert(o != NULL, "number");
     heim_assert(heim_get_tid(o) == heim_number_get_type_id(), "number-tid");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create("99", 10, 0, NULL);
     heim_assert(o != NULL, "number");
     heim_assert(heim_get_tid(o) == heim_number_get_type_id(), "number-tid");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create(" [ 1 ]", 10, 0, NULL);
     heim_assert(o != NULL, "array");
     heim_assert(heim_get_tid(o) == heim_array_get_type_id(), "array-tid");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     o = heim_json_create(" [ -1 ]", 10, 0, NULL);
     heim_assert(o != NULL, "array");
     heim_assert(heim_get_tid(o) == heim_array_get_type_id(), "array-tid");
+    o2 = heim_json_copy_serialize(o, 0, NULL);
+    o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+    heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+    heim_release(o3);
+    heim_release(o2);
     heim_release(o);
 
     for (i = 0; i < (sizeof (j) / sizeof (j[0])); i++) {
@@ -325,6 +600,11 @@ test_json(void)
 	    fprintf(stderr, "Failed to parse this JSON: %s\n", j[i]);
 	    return 1;
 	}
+        o2 = heim_json_copy_serialize(o, 0, NULL);
+        o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
+        heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
+        heim_release(o3);
+        heim_release(o2);
 	heim_release(o);
 	/* Simple fuzz test */
 	for (k = strlen(j[i]) - 1; k > 0; k--) {
diff --git a/lib/base/version-script.map b/lib/base/version-script.map
index 928e86199..70188985c 100644
--- a/lib/base/version-script.map
+++ b/lib/base/version-script.map
@@ -146,6 +146,7 @@ HEIMDAL_BASE_1.0 {
 		heim_json_copy_serialize;
 		heim_json_create;
 		heim_json_create_with_bytes;
+		heim_json_eq;
 		heim_load_plugins;
 		heim_log;
 		heim_log_msg;