2 * This code is copyrighted work by Daniel Luz <dev at mernen dot com>.
4 * Distributed under the Ruby and GPLv2 licenses; see COPYING and GPL files
9 import org.jruby.exceptions.RaiseException;
10 import org.jruby.runtime.ThreadContext;
11 import org.jruby.util.ByteList;
14 * A decoder that reads a JSON-encoded string from the given sources and
15 * returns its decoded form on a new ByteList. Escaped Unicode characters
16 * are encoded as UTF-8.
18 final class StringDecoder extends ByteListTranscoder {
20 * Stores the offset of the high surrogate when reading a surrogate pair,
// -1 is the "no pair in progress" sentinel: handleLowSurrogate sets this to
// charStart on entry and puts it back to -1 after the writeUtf8Char call for a
// valid pair; invalidUtf8 uses any value other than -1 as the start offset of
// the source excerpt in its error message.
23 private int surrogatePairStart = -1;
25 // Array used for writing multi-byte characters into the buffer at once
// Four slots, because writeUtf8Char fills at most aux[0]..aux[3] for a single
// code point (its widest branch writes four bytes).
26 private final byte[] aux = new byte[4];
// Package-private constructor taking the JRuby ThreadContext.
// NOTE(review): the body is not visible in this excerpt. invalidUtf8 reads a
// `context` that is not declared in the visible lines, so presumably the
// argument is handed on to ByteListTranscoder — confirm against the superclass.
28 StringDecoder(ThreadContext context) {
// Decodes the range of src given by start and end, producing output in a newly
// allocated ByteList.
// NOTE(review): the loop that drives handleChar and the return statement are on
// lines not visible in this excerpt; presumably `out` is what gets returned.
32 ByteList decode(ByteList src, int start, int end) {
// Pre-size the output buffer to the length of the input range (end - start).
33 ByteList out = new ByteList(end - start);
// init is not defined in the visible lines (presumably inherited from
// ByteListTranscoder); it is given the source, the range and the output buffer.
34 init(src, start, end, out);
// Read the next character from the source and dispatch on it.
36 handleChar(readUtf8Char());
// Processes one character `c` that decode has just read from the source.
// NOTE(review): only the call below is visible; the condition guarding it is on
// lines not shown — presumably it fires when c is a backslash, confirm.
42 private void handleChar(int c) {
// Delegate to the escape-sequence handler, which reads the following character(s).
45 handleEscapeSequence();
// Handles an escape sequence by reading the next character from the source and
// switching on it.
// NOTE(review): the individual case labels and the declaration of `cp` are on
// lines not visible here; presumably `cp` is the result of readHex() inside a
// 'u' case — confirm.
51 private void handleEscapeSequence() {
53 switch (readUtf8Char()) {
// cp is narrowed to char for the surrogate checks. A high surrogate is only
// the first half of a pair, so handleLowSurrogate is called to read the
// second half.
72 if (Character.isHighSurrogate((char)cp)) {
73 handleLowSurrogate((char)cp);
74 } else if (Character.isLowSurrogate((char)cp)) {
75 // low surrogate with no high surrogate
// Every character without a dedicated case lands in the default branch.
81 default: // '\\', '"', '/'...
// Reads the second half of a surrogate pair whose first half, highSurrogate,
// has already been read, and writes the combined code point as UTF-8.
// Throws the exception built by invalidUtf8() when a backslash is not followed
// by 'u'. NOTE(review): the branch taken when the value read is not a low
// surrogate is on lines not visible here.
86 private void handleLowSurrogate(char highSurrogate) {
// Record where the pair started so that invalidUtf8() can quote the source
// from that offset if the pair turns out to be malformed.
87 surrogatePairStart = charStart;
89 int lowSurrogate = readUtf8Char();
// A backslash means the low surrogate is itself written as an escape: it must
// be followed by 'u' and then hex digits, which readHex() parses.
91 if (lowSurrogate == '\\') {
93 if (readUtf8Char() != 'u') throw invalidUtf8();
94 lowSurrogate = readHex();
97 if (Character.isLowSurrogate((char)lowSurrogate)) {
// Combine both halves into one supplementary code point before encoding it.
98 writeUtf8Char(Character.toCodePoint(highSurrogate,
// Pair complete: clear the marker so later errors are reported from charStart.
100 surrogatePairStart = -1;
// Encodes codePoint as UTF-8. Multi-byte forms are staged in the shared `aux`
// array: a lead byte carrying the high bits, then continuation bytes of six
// payload bits each, produced by tailByte.
// NOTE(review): the statements that hand the bytes to the output (including the
// whole single-byte branch body) are on lines not visible in this excerpt.
106 private void writeUtf8Char(int codePoint) {
// Below 0x80: single-byte form.
107 if (codePoint < 0x80) {
// Below 0x800: two bytes, lead byte prefixed with 0xc0.
109 } else if (codePoint < 0x800) {
110 aux[0] = (byte)(0xc0 | (codePoint >>> 6));
// The explicit & 0x3f is redundant with the mask tailByte applies itself.
111 aux[1] = tailByte(codePoint & 0x3f);
// Below 0x10000: three bytes, lead byte prefixed with 0xe0.
113 } else if (codePoint < 0x10000) {
114 aux[0] = (byte)(0xe0 | (codePoint >>> 12));
115 aux[1] = tailByte(codePoint >>> 6);
116 aux[2] = tailByte(codePoint);
// Remaining case: four bytes, lead byte prefixed with 0xf0. `>>>` binds
// tighter than `|`, so this is 0xf0 | (codePoint >>> 18).
119 aux[0] = (byte)(0xf0 | codePoint >>> 18);
120 aux[1] = tailByte(codePoint >>> 12);
121 aux[2] = tailByte(codePoint >>> 6);
122 aux[3] = tailByte(codePoint);
// Builds a UTF-8 continuation byte from the low six bits of `value`: the bits
// above the sixth are masked off and the top bit (0x80) is set. Callers may
// therefore pass a shifted code point without masking it first.
127 private byte tailByte(int value) {
128 return (byte)(0x80 | (value & 0x3f));
132 * Reads a 4-digit unsigned hexadecimal number from the source.
// Parses hexadecimal digits from the source, one character per iteration,
// accepting 0-9, a-f and A-F, and accumulates them most-significant digit first.
// Throws NumberFormatException when a character is not a hex digit.
// NOTE(review): the declarations of `result`, `length` and `digitValue`, and the
// return statement, are on lines not visible here; the comment preceding this
// method says the number has 4 digits, so presumably length is 4 — confirm.
134 private int readHex() {
// Remember where the number begins so the error message can quote it.
135 int numberStart = pos;
138 for (int i = 0; i < length; i++) {
139 int digit = readUtf8Char();
141 if (digit >= '0' && digit <= '9') {
142 digitValue = digit - '0';
143 } else if (digit >= 'a' && digit <= 'f') {
144 digitValue = 10 + digit - 'a';
145 } else if (digit >= 'A' && digit <= 'F') {
146 digitValue = 10 + digit - 'A';
// Not a hex digit: report the whole `length`-character span, starting at
// numberStart, as the offending number.
148 throw new NumberFormatException("Invalid base 16 number "
149 + src.subSequence(numberStart, numberStart + length));
// Shift the accumulated value one hex place and add the new digit.
151 result = result * 16 + digitValue;
// Builds (and returns, rather than throws) the exception used to report
// malformed input; the visible callers write `throw invalidUtf8()`. The message
// is a fixed prefix followed by the raw source bytes from the offending
// position up to srcEnd.
// NOTE(review): being `protected`, this presumably overrides a hook declared in
// ByteListTranscoder — confirm against the superclass.
157 protected RaiseException invalidUtf8() {
158 ByteList message = new ByteList(
159 ByteList.plain("partial character in source, " +
160 "but hit end near "));
// When a surrogate pair is in progress, quote from the start of the pair
// rather than from the current character.
161 int start = surrogatePairStart != -1 ? surrogatePairStart : charStart;
// Append the srcEnd - start bytes of the source that begin at `start`.
162 message.append(src, start, srcEnd - start);
// Wrap the message in a Ruby string and create the exception through
// Utils.newException, passing Utils.M_PARSER_ERROR.
163 return Utils.newException(context, Utils.M_PARSER_ERROR,
164 context.getRuntime().newString(message));