base: Add JSON string non-ASCII escaping options

- Add HEIM_JSON_F_ESCAPE_NON_ASCII to indicate that non-ASCII characters
   must be escaped as \uXXXX.

 - Add HEIM_JSON_F_NO_ESCAPE_NON_ASCII to force non-escaping of BMP
   codepoints.

 - If the locale's codeset is not UTF-8 and
   HEIM_JSON_F_NO_ESCAPE_NON_ASCII is not set, then set
   HEIM_JSON_F_ESCAPE_NON_ASCII.
This commit is contained in:
Nicolas Williams
2022-10-02 22:43:20 -05:00
parent 03f06b9472
commit 8364bdd8f3
5 changed files with 146 additions and 12 deletions

View File

@@ -37,6 +37,10 @@
#include <ctype.h>
#include <base64.h>
#ifndef WIN32
#include <langinfo.h>
#endif
static heim_base_once_t heim_json_once = HEIM_BASE_ONCE_INIT;
static heim_string_t heim_tid_data_uuid_key = NULL;
static const char base64_chars[] =
@@ -136,6 +140,54 @@ dict2json(heim_object_t key, heim_object_t value, void *ctx)
}
}
#ifndef WIN32
/*
 * Initializer passed to heim_base_once_f() by heim_locale_is_utf8(): stores
 * 1 in the int that `ptr' points to when nl_langinfo(CODESET) names UTF-8,
 * and 0 otherwise.
 *
 * The codeset name is compared case-insensitively, and both the "UTF-8" and
 * "UTF8" spellings are accepted because the exact string returned by
 * nl_langinfo() is implementation-defined.  A NULL result is treated as
 * "not UTF-8" rather than being handed to strcasecmp().
 */
static void
init_is_utf8(void *ptr)
{
    const char *codeset = nl_langinfo(CODESET);

    *(int *)ptr = codeset != NULL &&
        (strcasecmp(codeset, "utf-8") == 0 ||
         strcasecmp(codeset, "utf8") == 0);
}
#endif
/*
 * Report whether the current locale's codeset is UTF-8.
 *
 * On non-WIN32 builds the answer is filled in by init_is_utf8() through
 * heim_base_once_f() (presumably only on the first call -- confirm against
 * heim_base_once_f()) and kept in a function-local static, whose value is
 * returned.  On WIN32 builds this always returns 0.
 */
int
heim_locale_is_utf8(void)
{
#ifndef WIN32
    static heim_base_once_t init_guard = HEIM_BASE_ONCE_INIT;
    static int cached_answer = -1;

    heim_base_once_f(&init_guard, &cached_answer, init_is_utf8);
    return cached_answer;
#else
    return 0; /* XXX Implement */
#endif
}
/*
 * Emit one 2- or 3-byte UTF-8 sequence as a JSON \uXXXX escape.
 *
 *  j       output state; the escape is written with j->out(j->ctx, ...)
 *  p       first byte of the UTF-8 sequence
 *  nbytes  length of the sequence; must be 2 or 3, anything else abort()s
 *
 * The payload bits of the lead byte and the continuation byte(s) are
 * assembled into a codepoint, which is then written as four upper-case
 * hexadecimal digits.  The bytes themselves are not validated here.
 */
static void
out_escaped_bmp(struct twojson *j, const unsigned char *p, int nbytes)
{
    static const char hexdigits[] = "0123456789ABCDEF";
    char esc[sizeof("\\u0000")];
    unsigned cp;
    int shift, i;

    switch (nbytes) {
    case 2:
        cp = ((p[0] & 0x1f) << 6) | (p[1] & 0x3f);
        break;
    case 3:
        cp = ((p[0] & 0x0f) << 12) | ((p[1] & 0x3f) << 6) | (p[2] & 0x3f);
        break;
    default:
        abort();
    }

    esc[0] = '\\';
    esc[1] = 'u';
    /* Most significant nibble first: shifts of 12, 8, 4, 0. */
    for (i = 2, shift = 12; i < 6; i++, shift -= 4)
        esc[i] = hexdigits[(cp >> shift) & 0x0f];
    esc[6] = '\0';

    j->out(j->ctx, esc);
}
static int
base2json(heim_object_t obj, struct twojson *j, int skip_indent)
{
@@ -265,6 +317,11 @@ base2json(heim_object_t obj, struct twojson *j, int skip_indent)
} else if (!good) {
return 1;
}
if (j->flags & HEIM_JSON_F_ESCAPE_NON_ASCII) {
out_escaped_bmp(j, p, 2);
p += 1;
continue;
}
e[0] = c;
e[1] = p[1];
e[2] = '\0';
@@ -289,6 +346,11 @@ base2json(heim_object_t obj, struct twojson *j, int skip_indent)
} else if (!good) {
return 1;
}
if (j->flags & HEIM_JSON_F_ESCAPE_NON_ASCII) {
out_escaped_bmp(j, p, 3);
p += 2;
continue;
}
e[0] = c;
e[1] = p[1];
e[2] = p[2];
@@ -451,6 +513,10 @@ heim_base2json(heim_object_t obj, void *ctx, heim_json_flags_t flags,
j.ret = 0;
j.first = 1;
if (!(flags & HEIM_JSON_F_NO_ESCAPE_NON_ASCII) &&
!heim_locale_is_utf8())
j.flags |= HEIM_JSON_F_ESCAPE_NON_ASCII;
return base2json(obj, &j, 0);
}