Skip to content

Commit c96351f

Browse files
LukeShu authored and byroot committed
Adjust to the CVTUTF code being gone
I, Luke T. Shumaker, am the sole author of the added code. I did not reference CVTUTF when writing it. I did reference the Unicode standard (15.0.0), the Wikipedia article on UTF-8, and the Wikipedia article on UTF-16. When I saw some tests fail, I did reference the old deleted code (but a JSON-specific part, inherently not as based on CVTUTF) to determine that script_safe should also escape U+2028 and U+2029. I targeted simplicity and clarity when writing the code--it can likely be optimized. In my mind, the obvious next optimization is to have it combine contiguous non-escaped characters into just one call to fbuffer_append(), instead of calling fbuffer_append() for each character. Regarding the use of the "modern" types `uint32_t`, `uint16_t`, and `bool`: - ruby.h is guaranteed to give us uint32_t and uint16_t. - Since Ruby 3.0.0, ruby.h is guaranteed to give us bool... but we support down to Ruby 2.3. But, ruby.h is guaranteed to give us HAVE_STDBOOL_H for the C99 stdbool.h; so use that to include stdbool.h if we can, and if not then fall back to a copy of the same bool definition that Ruby 3.0.5 uses with C89.
1 parent 0819553 commit c96351f

File tree

5 files changed

+239
-144
lines changed

5 files changed

+239
-144
lines changed

ext/json/ext/generator/generator.c

+108-44
Original file line number | Diff line number | Diff line change
@@ -18,50 +18,119 @@ static ID i_to_s, i_to_json, i_new, i_indent, i_space, i_space_before,
1818
i_aref, i_send, i_respond_to_p, i_match, i_keys, i_depth,
1919
i_buffer_initial_length, i_dup, i_script_safe, i_escape_slash, i_strict;
2020

21-
/* Escapes the UTF16 character and stores the result in the buffer buf. */
22-
static void unicode_escape(char *buf, UTF16 character)
21+
/* Converts in_string to a JSON string (without the wrapping '"'
22+
* characters) in FBuffer out_buffer.
23+
*
24+
* Characters are JSON-escaped according to:
25+
*
26+
* - Always: ASCII control characters (0x00-0x1F), dquote, and
27+
* backslash.
28+
*
29+
* - If out_ascii_only: non-ASCII characters (>0x7F)
30+
*
31+
* - If out_script_safe: forwardslash, line separator (U+2028), and
32+
* paragraph separator (U+2029)
33+
*
34+
* Everything else (should be UTF-8) is just passed through and
35+
* appended to the result.
36+
*/
37+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe)
2338
{
24-
const char *digits = "0123456789abcdef";
39+
const char *hexdig = "0123456789abcdef";
40+
char scratch[12] = { '\\', 'u', 0, 0, 0, 0, '\\', 'u' };
2541

26-
buf[2] = digits[character >> 12];
27-
buf[3] = digits[(character >> 8) & 0xf];
28-
buf[4] = digits[(character >> 4) & 0xf];
29-
buf[5] = digits[character & 0xf];
30-
}
42+
const char *in_utf8_str = RSTRING_PTR(in_string);
43+
unsigned long in_utf8_len = RSTRING_LEN(in_string);
44+
bool in_is_ascii_only = rb_enc_str_asciionly_p(in_string);
3145

32-
/* Escapes the UTF16 character and stores the result in the buffer buf, then
33-
* the buffer buf is appended to the FBuffer buffer. */
34-
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16
35-
character)
36-
{
37-
unicode_escape(buf, character);
38-
fbuffer_append(buffer, buf, 6);
39-
}
46+
unsigned long pos;
4047

41-
/* Converts string to a JSON string in FBuffer buffer, where all but the ASCII
42-
* and control characters are JSON escaped. */
43-
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe)
44-
{
45-
const UTF8 *source = (UTF8 *) RSTRING_PTR(string);
46-
const UTF8 *sourceEnd = source + RSTRING_LEN(string);
47-
char buf[6] = { '\\', 'u' };
48+
for (pos = 0; pos < in_utf8_len;) {
49+
uint32_t ch;
50+
unsigned long ch_len;
51+
bool should_escape;
4852

49-
RB_GC_GUARD(string);
50-
}
53+
/* UTF-8 decoding */
54+
if (in_is_ascii_only) {
55+
ch = in_utf8_str[pos];
56+
ch_len = 1;
57+
} else {
58+
short i;
59+
if ((in_utf8_str[pos] & 0x80) == 0x00) { ch_len = 1; ch = in_utf8_str[pos]; } /* leading 1 bit is 0b0 */
60+
else if ((in_utf8_str[pos] & 0xE0) == 0xC0) { ch_len = 2; ch = in_utf8_str[pos] & 0x1F; } /* leading 3 bits are 0b110 */
61+
else if ((in_utf8_str[pos] & 0xF0) == 0xE0) { ch_len = 3; ch = in_utf8_str[pos] & 0x0F; } /* leading 4 bits are 0b1110 */
62+
else if ((in_utf8_str[pos] & 0xF8) == 0xF0) { ch_len = 4; ch = in_utf8_str[pos] & 0x07; } /* leading 5 bits are 0b11110 */
63+
else
64+
rb_raise(rb_path2class("JSON::GeneratorError"),
65+
"source sequence is illegal/malformed utf-8");
66+
if ((pos+ch_len) > in_utf8_len)
67+
rb_raise(rb_path2class("JSON::GeneratorError"),
68+
"partial character in source, but hit end");
69+
for (i = 1; i < ch_len; i++) {
70+
if ((in_utf8_str[pos+i] & 0xC0) != 0x80) /* leading 2 bits should be 0b10 */
71+
rb_raise(rb_path2class("JSON::GeneratorError"),
72+
"source sequence is illegal/malformed utf-8");
73+
ch = (ch<<6) | (in_utf8_str[pos+i] & 0x3F);
74+
}
75+
if (ch > 0x10FFFF)
76+
rb_raise(rb_path2class("JSON::GeneratorError"),
77+
"source sequence is illegal/malformed utf-8");
78+
}
5179

52-
/* Converts string to a JSON string in FBuffer buffer, where only the
53-
* characters required by the JSON standard are JSON escaped. The remaining
54-
* characters (should be UTF8) are just passed through and appended to the
55-
* result. */
56-
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe)
57-
{
58-
const char *ptr = RSTRING_PTR(string), *p;
59-
unsigned long len = RSTRING_LEN(string), start = 0, end = 0;
60-
const char *escape = NULL;
61-
int escape_len;
62-
unsigned char c;
63-
char buf[6] = { '\\', 'u' };
64-
int ascii_only = rb_enc_str_asciionly_p(string);
80+
/* JSON policy */
81+
should_escape =
82+
(ch < 0x20) ||
83+
(ch == '"') ||
84+
(ch == '\\') ||
85+
(out_ascii_only && (ch > 0x7F)) ||
86+
(out_script_safe && (ch == '/')) ||
87+
(out_script_safe && (ch == 0x2028)) ||
88+
(out_script_safe && (ch == 0x2029));
89+
90+
/* JSON encoding */
91+
if (should_escape) {
92+
switch (ch) {
93+
case '"': fbuffer_append(out_buffer, "\\\"", 2); break;
94+
case '\\': fbuffer_append(out_buffer, "\\\\", 2); break;
95+
case '/': fbuffer_append(out_buffer, "\\/", 2); break;
96+
case '\b': fbuffer_append(out_buffer, "\\b", 2); break;
97+
case '\f': fbuffer_append(out_buffer, "\\f", 2); break;
98+
case '\n': fbuffer_append(out_buffer, "\\n", 2); break;
99+
case '\r': fbuffer_append(out_buffer, "\\r", 2); break;
100+
case '\t': fbuffer_append(out_buffer, "\\t", 2); break;
101+
default:
102+
if (ch <= 0xFFFF) {
103+
scratch[2] = hexdig[ch >> 12];
104+
scratch[3] = hexdig[(ch >> 8) & 0xf];
105+
scratch[4] = hexdig[(ch >> 4) & 0xf];
106+
scratch[5] = hexdig[ch & 0xf];
107+
fbuffer_append(out_buffer, scratch, 6);
108+
} else {
109+
uint16_t hi, lo;
110+
ch -= 0x10000;
111+
hi = 0xD800 + (uint16_t)(ch >> 10);
112+
lo = 0xDC00 + (uint16_t)(ch & 0x3FF);
113+
114+
scratch[2] = hexdig[hi >> 12];
115+
scratch[3] = hexdig[(hi >> 8) & 0xf];
116+
scratch[4] = hexdig[(hi >> 4) & 0xf];
117+
scratch[5] = hexdig[hi & 0xf];
118+
119+
scratch[8] = hexdig[lo >> 12];
120+
scratch[9] = hexdig[(lo >> 8) & 0xf];
121+
scratch[10] = hexdig[(lo >> 4) & 0xf];
122+
scratch[11] = hexdig[lo & 0xf];
123+
124+
fbuffer_append(out_buffer, scratch, 12);
125+
}
126+
}
127+
} else {
128+
fbuffer_append(out_buffer, &in_utf8_str[pos], ch_len);
129+
}
130+
131+
pos += ch_len;
132+
}
133+
RB_GC_GUARD(in_string);
65134
}
66135

67136
static char *fstrndup(const char *ptr, unsigned long len) {
@@ -698,12 +767,7 @@ static void generate_json_string(FBuffer *buffer, VALUE Vstate, JSON_Generator_S
698767
if (!enc_utf8_compatible_p(rb_enc_get(obj))) {
699768
obj = rb_str_export_to_enc(obj, rb_utf8_encoding());
700769
}
701-
702-
if (state->ascii_only) {
703-
convert_UTF8_to_JSON_ASCII(buffer, obj, state->script_safe);
704-
} else {
705-
convert_UTF8_to_JSON(buffer, obj, state->script_safe);
706-
}
770+
convert_UTF8_to_JSON(buffer, obj, state->ascii_only, state->script_safe);
707771
fbuffer_append_char(buffer, '"');
708772
}
709773

ext/json/ext/generator/generator.h

+9-4
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,14 @@
66

77
#include "ruby.h"
88

9+
#ifdef HAVE_STDBOOL_H
10+
#include <stdbool.h>
11+
#else
12+
/* This is the fallback definition from Ruby 3.0.5. */
13+
typedef unsigned char _Bool; /* used only when <stdbool.h> is unavailable */
14+
#define bool _Bool
15+
#endif
16+
917
#ifdef HAVE_RUBY_RE_H
1018
#include "ruby/re.h"
1119
#else
@@ -22,10 +30,7 @@
2230

2331
#define option_given_p(opts, key) RTEST(rb_funcall(opts, i_key_p, 1, key))
2432

25-
static void unicode_escape(char *buf, UTF16 character);
26-
static void unicode_escape_to_buffer(FBuffer *buffer, char buf[6], UTF16 character);
27-
static void convert_UTF8_to_JSON_ASCII(FBuffer *buffer, VALUE string, char script_safe);
28-
static void convert_UTF8_to_JSON(FBuffer *buffer, VALUE string, char script_safe);
33+
static void convert_UTF8_to_JSON(FBuffer *out_buffer, VALUE in_string, bool out_ascii_only, bool out_script_safe);
2934
static char *fstrndup(const char *ptr, unsigned long len);
3035

3136
/* ruby api and some helpers */

0 commit comments

Comments
 (0)