2 * This code is copyrighted work by Daniel Luz <dev at mernen dot com>.
4 * Distributed under the Ruby and GPLv2 licenses; see COPYING and GPL files
9 import org.jruby.exceptions.RaiseException;
10 import org.jruby.runtime.ThreadContext;
11 import org.jruby.util.ByteList;
14 * A class specialized in transcoding a certain String format into another,
15 * using UTF-8 ByteLists as both input and output.
17 abstract class ByteListTranscoder {
18 protected final ThreadContext context;
20 protected ByteList src;
22 /** Position where the last read character started */
23 protected int charStart;
24 /** Position of the next character to read */
29 * When a character that can be copied straight into the output is found,
30 * its index is stored in this variable, and copying is delayed until
31 * the sequence of characters that can be copied ends.
33 * <p>The variable stores -1 when not in a plain sequence.
35 private int quoteStart = -1;
37 protected ByteListTranscoder(ThreadContext context) {
38 this.context = context;
41 protected void init(ByteList src, ByteList out) {
42 this.init(src, 0, src.length(), out);
45 protected void init(ByteList src, int start, int end, ByteList out) {
48 this.charStart = start;
54 * Returns whether there are any characters left to be read.
56 protected boolean hasNext() {
61 * Returns the next character in the buffer.
64 return src.charAt(pos++);
68 * Reads a UTF-8 character from the input and returns its code point,
69 * while advancing the input position.
71 * <p>Raises an {@link #invalidUtf8()} exception if an invalid byte
74 protected int readUtf8Char() {
77 if (head <= 0x7f) { // 0b0xxxxxxx (ASCII)
80 if (head <= 0xbf) { // 0b10xxxxxx
81 throw invalidUtf8(); // tail byte with no head
83 if (head <= 0xdf) { // 0b110xxxxx
85 int cp = ((head & 0x1f) << 6)
87 if (cp < 0x0080) throw invalidUtf8();
90 if (head <= 0xef) { // 0b1110xxxx
92 int cp = ((head & 0x0f) << 12)
95 if (cp < 0x0800) throw invalidUtf8();
98 if (head <= 0xf7) { // 0b11110xxx
100 int cp = ((head & 0x07) << 18)
104 if (!Character.isValidCodePoint(cp)) throw invalidUtf8();
112 * Throws an {@link #incompleteUtf8()} exception if the input list doesn't have at least this
115 protected void ensureMin(int n) {
116 if (pos + n > srcEnd) throw incompleteUtf8();
120 * Reads the next byte of a multi-byte UTF-8 character and returns its
121 * contents (lower 6 bits).
123 * <p>Raises an {@link #invalidUtf8()} exception if the byte is not a valid tail.
125 private int nextPart() {
127 // tail bytes must be 0b10xxxxxx
128 if ((c & 0xc0) != 0x80) throw invalidUtf8();
133 protected void quoteStart() {
134 if (quoteStart == -1) quoteStart = charStart;
138 * When in a sequence of characters that can be copied directly,
139 * interrupts the sequence and copies it to the output buffer.
141 * @param endPos The offset until which the direct character quoting should
142 * occur. You may pass {@link #pos} to quote until the most
143 * recently read character, or {@link #charStart} to quote
144 * until the character before it.
146 protected void quoteStop(int endPos) {
147 if (quoteStart != -1) {
148 out.append(src, quoteStart, endPos - quoteStart);
153 protected void append(int b) {
157 protected void append(byte[] origin, int start, int length) {
158 out.append(origin, start, length);
162 protected abstract RaiseException invalidUtf8();
164 protected RaiseException incompleteUtf8() {
165 return invalidUtf8();