001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
021import java.io.IOException;
022import java.io.DataInput;
023import java.io.DataOutput;
024import java.io.UTFDataFormatException;
025
026import org.apache.hadoop.util.StringUtils;
027
028import org.apache.commons.logging.*;
029import org.apache.hadoop.classification.InterfaceAudience;
030import org.apache.hadoop.classification.InterfaceStability;
031
032/** A WritableComparable for strings that uses the UTF8 encoding.
033 * 
034 * <p>Also includes utilities for efficiently reading and writing UTF-8.
035 *
036 * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of
037 * UTF-8: see http://en.wikipedia.org/wiki/CESU-8
038 *
039 * @deprecated replaced by Text
040 */
041@Deprecated
042@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
043@InterfaceStability.Stable
044public class UTF8 implements WritableComparable<UTF8> {
045  private static final Log LOG= LogFactory.getLog(UTF8.class);
046  private static final DataInputBuffer IBUF = new DataInputBuffer();
047
048  private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY =
049    new ThreadLocal<DataOutputBuffer>(){
050    @Override
051    protected DataOutputBuffer initialValue() {
052      return new DataOutputBuffer();
053    }
054  };
055
056  private static final byte[] EMPTY_BYTES = new byte[0];
057
058  private byte[] bytes = EMPTY_BYTES;
059  private int length;
060
061  public UTF8() {
062    //set("");
063  }
064
065  /** Construct from a given string. */
066  public UTF8(String string) {
067    set(string);
068  }
069
070  /** Construct from a given string. */
071  public UTF8(UTF8 utf8) {
072    set(utf8);
073  }
074
075  /** The raw bytes. */
076  public byte[] getBytes() {
077    return bytes;
078  }
079
080  /** The number of bytes in the encoded string. */
081  public int getLength() {
082    return length;
083  }
084
085  /** Set to contain the contents of a string. */
086  public void set(String string) {
087    if (string.length() > 0xffff/3) {             // maybe too long
088      LOG.warn("truncating long string: " + string.length()
089               + " chars, starting with " + string.substring(0, 20));
090      string = string.substring(0, 0xffff/3);
091    }
092
093    length = utf8Length(string);                  // compute length
094    if (length > 0xffff)                          // double-check length
095      throw new RuntimeException("string too long!");
096
097    if (bytes == null || length > bytes.length)   // grow buffer
098      bytes = new byte[length];
099
100    try {                                         // avoid sync'd allocations
101      DataOutputBuffer obuf = OBUF_FACTORY.get();
102      obuf.reset();
103      writeChars(obuf, string, 0, string.length());
104      System.arraycopy(obuf.getData(), 0, bytes, 0, length);
105    } catch (IOException e) {
106      throw new RuntimeException(e);
107    }
108  }
109
110  /** Set to contain the contents of a string. */
111  public void set(UTF8 other) {
112    length = other.length;
113    if (bytes == null || length > bytes.length)   // grow buffer
114      bytes = new byte[length];
115    System.arraycopy(other.bytes, 0, bytes, 0, length);
116  }
117
118  @Override
119  public void readFields(DataInput in) throws IOException {
120    length = in.readUnsignedShort();
121    if (bytes == null || bytes.length < length)
122      bytes = new byte[length];
123    in.readFully(bytes, 0, length);
124  }
125
126  /** Skips over one UTF8 in the input. */
127  public static void skip(DataInput in) throws IOException {
128    int length = in.readUnsignedShort();
129    WritableUtils.skipFully(in, length);
130  }
131
132  @Override
133  public void write(DataOutput out) throws IOException {
134    out.writeShort(length);
135    out.write(bytes, 0, length);
136  }
137
138  /** Compare two UTF8s. */
139  @Override
140  public int compareTo(UTF8 o) {
141    return WritableComparator.compareBytes(bytes, 0, length,
142                                           o.bytes, 0, o.length);
143  }
144
145  /** Convert to a String. */
146  @Override
147  public String toString() {
148    StringBuilder buffer = new StringBuilder(length);
149    try {
150      synchronized (IBUF) {
151        IBUF.reset(bytes, length);
152        readChars(IBUF, buffer, length);
153      }
154    } catch (IOException e) {
155      throw new RuntimeException(e);
156    }
157    return buffer.toString();
158  }
159  
160  /**
161   * Convert to a string, checking for valid UTF8.
162   * @return the converted string
163   * @throws UTFDataFormatException if the underlying bytes contain invalid
164   * UTF8 data.
165   */
166  public String toStringChecked() throws IOException {
167    StringBuilder buffer = new StringBuilder(length);
168    synchronized (IBUF) {
169      IBUF.reset(bytes, length);
170      readChars(IBUF, buffer, length);
171    }
172    return buffer.toString();
173  }
174
175  /** Returns true iff <code>o</code> is a UTF8 with the same contents.  */
176  @Override
177  public boolean equals(Object o) {
178    if (!(o instanceof UTF8))
179      return false;
180    UTF8 that = (UTF8)o;
181    if (this.length != that.length)
182      return false;
183    else
184      return WritableComparator.compareBytes(bytes, 0, length,
185                                             that.bytes, 0, that.length) == 0;
186  }
187
188  @Override
189  public int hashCode() {
190    return WritableComparator.hashBytes(bytes, length);
191  }
192
193  /** A WritableComparator optimized for UTF8 keys. */
194  public static class Comparator extends WritableComparator {
195    public Comparator() {
196      super(UTF8.class);
197    }
198
199    @Override
200    public int compare(byte[] b1, int s1, int l1,
201                       byte[] b2, int s2, int l2) {
202      int n1 = readUnsignedShort(b1, s1);
203      int n2 = readUnsignedShort(b2, s2);
204      return compareBytes(b1, s1+2, n1, b2, s2+2, n2);
205    }
206  }
207
208  static {                                        // register this comparator
209    WritableComparator.define(UTF8.class, new Comparator());
210  }
211
212  /// STATIC UTILITIES FROM HERE DOWN
213
214  /// These are probably not used much anymore, and might be removed...
215
216  /** Convert a string to a UTF-8 encoded byte array.
217   * @see String#getBytes(String)
218   */
219  public static byte[] getBytes(String string) {
220    byte[] result = new byte[utf8Length(string)];
221    try {                                         // avoid sync'd allocations
222      DataOutputBuffer obuf = OBUF_FACTORY.get();
223      obuf.reset();
224      writeChars(obuf, string, 0, string.length());
225      System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength());
226    } catch (IOException e) {
227      throw new RuntimeException(e);
228    }
229    return result;
230  }
231
232  /**
233   * Convert a UTF-8 encoded byte array back into a string.
234   *
235   * @throws IOException if the byte array is invalid UTF8
236   */
237  public static String fromBytes(byte[] bytes) throws IOException {
238    DataInputBuffer dbuf = new DataInputBuffer();
239    dbuf.reset(bytes, 0, bytes.length);
240    StringBuilder buf = new StringBuilder(bytes.length);
241    readChars(dbuf, buf, bytes.length);
242    return buf.toString();
243  }
244
245  /** Read a UTF-8 encoded string.
246   *
247   * @see DataInput#readUTF()
248   */
249  public static String readString(DataInput in) throws IOException {
250    int bytes = in.readUnsignedShort();
251    StringBuilder buffer = new StringBuilder(bytes);
252    readChars(in, buffer, bytes);
253    return buffer.toString();
254  }
255
256  private static void readChars(DataInput in, StringBuilder buffer, int nBytes)
257    throws UTFDataFormatException, IOException {
258    DataOutputBuffer obuf = OBUF_FACTORY.get();
259    obuf.reset();
260    obuf.write(in, nBytes);
261    byte[] bytes = obuf.getData();
262    int i = 0;
263    while (i < nBytes) {
264      byte b = bytes[i++];
265      if ((b & 0x80) == 0) {
266        // 0b0xxxxxxx: 1-byte sequence
267        buffer.append((char)(b & 0x7F));
268      } else if ((b & 0xE0) == 0xC0) {
269        if (i >= nBytes) {
270          throw new UTFDataFormatException("Truncated UTF8 at " +
271              StringUtils.byteToHexString(bytes, i - 1, 1));
272        }
273        // 0b110xxxxx: 2-byte sequence
274        buffer.append((char)(((b & 0x1F) << 6)
275            | (bytes[i++] & 0x3F)));
276      } else if ((b & 0xF0) == 0xE0) {
277        // 0b1110xxxx: 3-byte sequence
278        if (i + 1 >= nBytes) {
279          throw new UTFDataFormatException("Truncated UTF8 at " +
280              StringUtils.byteToHexString(bytes, i - 1, 2));
281        }
282        buffer.append((char)(((b & 0x0F) << 12)
283            | ((bytes[i++] & 0x3F) << 6)
284            |  (bytes[i++] & 0x3F)));
285      } else if ((b & 0xF8) == 0xF0) {
286        if (i + 2 >= nBytes) {
287          throw new UTFDataFormatException("Truncated UTF8 at " +
288              StringUtils.byteToHexString(bytes, i - 1, 3));
289        }
290        // 0b11110xxx: 4-byte sequence
291        int codepoint =
292            ((b & 0x07) << 18)
293          | ((bytes[i++] & 0x3F) <<  12)
294          | ((bytes[i++] & 0x3F) <<  6)
295          | ((bytes[i++] & 0x3F));
296        buffer.append(highSurrogate(codepoint))
297              .append(lowSurrogate(codepoint));
298      } else {
299        // The UTF8 standard describes 5-byte and 6-byte sequences, but
300        // these are no longer allowed as of 2003 (see RFC 3629)
301
302        // Only show the next 6 bytes max in the error code - in case the
303        // buffer is large, this will prevent an exceedingly large message.
304        int endForError = Math.min(i + 5, nBytes);
305        throw new UTFDataFormatException("Invalid UTF8 at " +
306            StringUtils.byteToHexString(bytes, i - 1, endForError));
307      }
308    }
309  }
310
311  private static char highSurrogate(int codePoint) {
312    return (char) ((codePoint >>> 10)
313        + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10)));
314  }
315
316  private static char lowSurrogate(int codePoint) {
317    return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE);
318  }
319
320  /** Write a UTF-8 encoded string.
321   *
322   * @see DataOutput#writeUTF(String)
323   */
324  public static int writeString(DataOutput out, String s) throws IOException {
325    if (s.length() > 0xffff/3) {         // maybe too long
326      LOG.warn("truncating long string: " + s.length()
327               + " chars, starting with " + s.substring(0, 20));
328      s = s.substring(0, 0xffff/3);
329    }
330
331    int len = utf8Length(s);
332    if (len > 0xffff)                             // double-check length
333      throw new IOException("string too long!");
334      
335    out.writeShort(len);
336    writeChars(out, s, 0, s.length());
337    return len;
338  }
339
340  /** Returns the number of bytes required to write this. */
341  private static int utf8Length(String string) {
342    int stringLength = string.length();
343    int utf8Length = 0;
344    for (int i = 0; i < stringLength; i++) {
345      int c = string.charAt(i);
346      if (c <= 0x007F) {
347        utf8Length++;
348      } else if (c > 0x07FF) {
349        utf8Length += 3;
350      } else {
351        utf8Length += 2;
352      }
353    }
354    return utf8Length;
355  }
356
357  private static void writeChars(DataOutput out,
358                                 String s, int start, int length)
359    throws IOException {
360    final int end = start + length;
361    for (int i = start; i < end; i++) {
362      int code = s.charAt(i);
363      if (code <= 0x7F) {
364        out.writeByte((byte)code);
365      } else if (code <= 0x07FF) {
366        out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F)));
367        out.writeByte((byte)(0x80 |   code       & 0x3F));
368      } else {
369        out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F)));
370        out.writeByte((byte)(0x80 | ((code >>  6) & 0x3F)));
371        out.writeByte((byte)(0x80 |  (code        & 0x3F)));
372      }
373    }
374  }
375
376}