base: Add JSON string non-ASCII escaping options

 - Add HEIM_JSON_F_ESCAPE_NON_ASCII to indicate that non-ASCII must be
   escaped as \uXXXX.

 - Add HEIM_JSON_F_NO_ESCAPE_NON_ASCII to force non-escaping of BMP
   codepoints.

 - If the locale's codeset is not UTF-8 and
   HEIM_JSON_F_NO_ESCAPE_NON_ASCII is not set, then set
   HEIM_JSON_F_ESCAPE_NON_ASCII.
This commit is contained in:
Nicolas Williams
2022-10-02 22:43:20 -05:00
parent 03f06b9472
commit 8364bdd8f3
5 changed files with 146 additions and 12 deletions

View File

@@ -464,10 +464,12 @@ typedef enum heim_json_flags {
HEIM_JSON_F_CNULL2JSNULL = 32,
HEIM_JSON_F_TRY_DECODE_DATA = 64,
HEIM_JSON_F_ONE_LINE = 128,
HEIM_JSON_F_ESCAPE_NON_ASCII = 256,
HEIM_JSON_F_NO_ESCAPE_NON_ASCII = 512,
/* The default is to indent with one tab */
HEIM_JSON_F_INDENT2 = 256,
HEIM_JSON_F_INDENT4 = 512,
HEIM_JSON_F_INDENT8 = 1024,
HEIM_JSON_F_INDENT2 = 1024,
HEIM_JSON_F_INDENT4 = 2048,
HEIM_JSON_F_INDENT8 = 4096,
} heim_json_flags_t;
heim_object_t heim_json_create(const char *, size_t, heim_json_flags_t,

View File

@@ -37,6 +37,10 @@
#include <ctype.h>
#include <base64.h>
#ifndef WIN32
#include <langinfo.h>
#endif
static heim_base_once_t heim_json_once = HEIM_BASE_ONCE_INIT;
static heim_string_t heim_tid_data_uuid_key = NULL;
static const char base64_chars[] =
@@ -136,6 +140,54 @@ dict2json(heim_object_t key, heim_object_t value, void *ctx)
}
}
#ifndef WIN32
/*
 * One-time initializer used by heim_locale_is_utf8().
 *
 * `ptr` must point to an int; it is set to 1 if the current locale's codeset
 * (as reported by nl_langinfo(CODESET)) names UTF-8, and to 0 otherwise.
 *
 * Codeset names are not standardized: most systems report "UTF-8", but some
 * reportedly use the hyphen-less "utf8" (e.g. HP-UX, per gnulib's
 * localcharset table), so both spellings are accepted, case-insensitively.
 */
static void
init_is_utf8(void *ptr)
{
    const char *codeset = nl_langinfo(CODESET);

    /* Be defensive: never hand a NULL pointer to strcasecmp(). */
    *(int *)ptr = codeset != NULL &&
        (strcasecmp(codeset, "utf-8") == 0 ||
         strcasecmp(codeset, "utf8") == 0);
}
#endif
/*
 * Report whether the locale's codeset is UTF-8.
 *
 * Returns non-zero if it is, 0 if it is not.  On WIN32 this is not yet
 * implemented and always returns 0.
 *
 * Elsewhere the answer is produced by init_is_utf8() via heim_base_once_f()
 * and stored in a function-local static, so the result is presumably
 * computed on the first call only and does not track later setlocale()
 * calls -- confirm against heim_base_once_f()'s contract.
 */
int
heim_locale_is_utf8(void)
{
#ifndef WIN32
    static heim_base_once_t guard = HEIM_BASE_ONCE_INIT;
    static int is_utf8 = -1; /* -1 until init_is_utf8() has stored 0 or 1 */

    heim_base_once_f(&guard, &is_utf8, init_is_utf8);
    return is_utf8;
#else
    return 0; /* XXX Implement */
#endif
}
/*
 * Emit a single BMP codepoint as a JSON "\uXXXX" escape (uppercase hex).
 *
 * `p` points at the first byte of a 2- or 3-byte UTF-8 sequence and `nbytes`
 * is that sequence's length.  The continuation bytes are not validated here;
 * only their payload bits are used.  Any other `nbytes` value is a
 * programming error and aborts.
 *
 * The escape is handed to the serializer's output callback, j->out, as a
 * NUL-terminated string.
 */
static void
out_escaped_bmp(struct twojson *j, const unsigned char *p, int nbytes)
{
    static const char hexdigits[] = "0123456789ABCDEF";
    char esc[sizeof("\\u0000")];
    unsigned cp;

    switch (nbytes) {
    case 2:
        /* 110xxxxx 10xxxxxx */
        cp = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
        break;
    case 3:
        /* 1110xxxx 10xxxxxx 10xxxxxx */
        cp = ((p[0] & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
        break;
    default:
        abort();
    }

    /* cp is at most 0xFFFF here, so four hex digits always suffice. */
    esc[0] = '\\';
    esc[1] = 'u';
    esc[2] = hexdigits[(cp >> 12) & 0x0f];
    esc[3] = hexdigits[(cp >> 8) & 0x0f];
    esc[4] = hexdigits[(cp >> 4) & 0x0f];
    esc[5] = hexdigits[cp & 0x0f];
    esc[6] = '\0';
    j->out(j->ctx, esc);
}
static int
base2json(heim_object_t obj, struct twojson *j, int skip_indent)
{
@@ -265,6 +317,11 @@ base2json(heim_object_t obj, struct twojson *j, int skip_indent)
} else if (!good) {
return 1;
}
if (j->flags & HEIM_JSON_F_ESCAPE_NON_ASCII) {
out_escaped_bmp(j, p, 2);
p += 1;
continue;
}
e[0] = c;
e[1] = p[1];
e[2] = '\0';
@@ -289,6 +346,11 @@ base2json(heim_object_t obj, struct twojson *j, int skip_indent)
} else if (!good) {
return 1;
}
if (j->flags & HEIM_JSON_F_ESCAPE_NON_ASCII) {
out_escaped_bmp(j, p, 3);
p += 2;
continue;
}
e[0] = c;
e[1] = p[1];
e[2] = p[2];
@@ -451,6 +513,10 @@ heim_base2json(heim_object_t obj, void *ctx, heim_json_flags_t flags,
j.ret = 0;
j.first = 1;
if (!(flags & HEIM_JSON_F_NO_ESCAPE_NON_ASCII) &&
!heim_locale_is_utf8())
j.flags |= HEIM_JSON_F_ESCAPE_NON_ASCII;
return base2json(obj, &j, 0);
}

View File

@@ -239,7 +239,7 @@ heim_string_t
__heim_string_constant(const char *_str)
{
static HEIMDAL_MUTEX mutex = HEIMDAL_MUTEX_INITIALIZER;
static heim_base_once_t once;
static heim_base_once_t once = HEIM_BASE_ONCE_INIT;
static heim_dict_t dict = NULL;
heim_string_t s, s2;

View File

@@ -53,6 +53,7 @@
#include <sys/stat.h>
#ifndef WIN32
#include <sys/file.h>
#include <locale.h>
#endif
#ifdef HAVE_IO_H
#include <io.h>
@@ -290,7 +291,9 @@ test_json(void)
"\xe0\xA0\x81"
"\xe8\x80\x81"
"\xf0\x9d\x84\x9e", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001Eá߿ࠁ老\\uD834\\uDD1E\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
@@ -319,7 +322,9 @@ test_json(void)
"\xe0\xA0\x81"
"\xe8\x80\x81"
"\xf0\x9d\x84\x9e", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001Eá߿ࠁ老\\uD834\\uDD1E\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
@@ -329,6 +334,51 @@ test_json(void)
heim_release(o2);
heim_release(o);
/*
* Test HEIM_JSON_F_ESCAPE_NON_ASCII.
*
* Also test the default behavior: main() calls setlocale(LC_ALL, "C"), so
* the locale is not UTF-8 and non-ASCII should be escaped even when the
* flag is not given.
*/
o = heim_json_create("\""
"\\b\\f\\n\\r\\t" /* ASCII C-like escapes */
"\x1e" /* ASCII control character w/o C-like escape */
"\xc3\xa1"
"\xdf\xbf"
"\xe0\xa0\x81"
"\xE8\x80\x81"
"\\uD834\\udd1e" /* U+1D11E, as shown in RFC 7159 */
"\"", 10, 0, NULL);
heim_assert(o != NULL, "string");
heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
heim_assert(strcmp(
"\b\f\n\r\t"
"\x1e"
"\xc3\xa1"
"\xdf\xbf"
"\xe0\xA0\x81"
"\xe8\x80\x81"
"\xf0\x9d\x84\x9e", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001E\\u00E1\\u07FF\\u0801\\u8001"
"\\uD834\\uDD1E\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
heim_release(o2);
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
heim_assert(strcmp("\"\\b\\f\\n\\r\\t\\u001E\\u00E1\\u07FF\\u0801\\u8001"
"\\uD834\\uDD1E\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
heim_assert(heim_json_eq(o, o3), "JSON text did not round-trip");
heim_release(o3);
heim_release(o2);
heim_release(o);
/* Test rejection of unescaped ASCII control characters */
o = heim_json_create("\"\b\\f\"", 10, HEIM_JSON_F_STRICT, NULL);
heim_assert(o == NULL, "strict parse accepted bad input");
@@ -339,7 +389,9 @@ test_json(void)
heim_assert(o != NULL, "string");
heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
heim_assert(strcmp("\b\f", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\b\\f\"", heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
@@ -359,7 +411,9 @@ test_json(void)
heim_assert(o != NULL, "malformed string rejected (not strict)");
heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
heim_assert(strcmp(" ", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\" \"", heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
@@ -383,7 +437,9 @@ test_json(void)
heim_assert(strcmp(
"\xe8\x80\x81"
"\\uD834\\udd", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\\\uD834\\\\udd\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
@@ -408,7 +464,9 @@ test_json(void)
heim_assert(strcmp(
"\xe8\x80\x81"
"\\uD83", heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(strcmp("\"\\\\uD83\"",
heim_string_get_utf8(o2)) == 0,
"JSON encoding changed; please check that it is till valid");
@@ -428,9 +486,11 @@ test_json(void)
heim_assert(heim_get_tid(o) == heim_string_get_type_id(), "string-tid");
heim_assert(strcmp("\xe8\x80",
heim_string_get_utf8(o)) == 0, "wrong string");
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_STRICT, NULL);
o2 = heim_json_copy_serialize(o,
HEIM_JSON_F_STRICT |
HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
heim_assert(o2 == NULL, "malformed string serialized");
o2 = heim_json_copy_serialize(o, 0, NULL);
o2 = heim_json_copy_serialize(o, HEIM_JSON_F_NO_ESCAPE_NON_ASCII, NULL);
o3 = heim_json_create(heim_string_get_utf8(o2), 10, HEIM_JSON_F_STRICT, NULL);
heim_assert(o3 == NULL, "malformed string accepted (not strict)");
o3 = heim_json_create(heim_string_get_utf8(o2), 10, 0, NULL);
@@ -1225,6 +1285,11 @@ main(int argc, char **argv)
{
int res = 0;
#ifndef WIN32
setlocale(LC_ALL, "C");
heim_assert(!heim_locale_is_utf8(), "setlocale(LC_ALL, \"C\") failed?");
#endif
res |= test_memory();
res |= test_mutex();
res |= test_rwlock();

View File

@@ -148,6 +148,7 @@ HEIMDAL_BASE_1.0 {
heim_json_create_with_bytes;
heim_json_eq;
heim_load_plugins;
heim_locale_is_utf8;
heim_log;
heim_log_msg;
_heim_make_permanent;