001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
021import java.io.IOException;
022import java.io.DataInput;
023import java.io.DataOutput;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.CharacterCodingException;
027import java.nio.charset.Charset;
028import java.nio.charset.CharsetDecoder;
029import java.nio.charset.CharsetEncoder;
030import java.nio.charset.CodingErrorAction;
031import java.nio.charset.MalformedInputException;
032import java.text.CharacterIterator;
033import java.text.StringCharacterIterator;
034import java.util.Arrays;
035
036import org.apache.avro.reflect.Stringable;
037
038import org.apache.hadoop.classification.InterfaceAudience;
039import org.apache.hadoop.classification.InterfaceStability;
040
041/** This class stores text using standard UTF8 encoding.  It provides methods
042 * to serialize, deserialize, and compare texts at byte level.  The type of
043 * length is integer and is serialized using zero-compressed format.  <p>In
044 * addition, it provides methods for string traversal without converting the
045 * byte array to a string.  <p>Also includes utilities for
046 * serializing/deserialing a string, coding/decoding a string, checking if a
047 * byte array contains valid UTF8 code, calculating the length of an encoded
048 * string.
049 */
050@Stringable
051@InterfaceAudience.Public
052@InterfaceStability.Stable
053public class Text extends BinaryComparable
054    implements WritableComparable<BinaryComparable> {
055  
056  private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057    new ThreadLocal<CharsetEncoder>() {
058      protected CharsetEncoder initialValue() {
059        return Charset.forName("UTF-8").newEncoder().
060               onMalformedInput(CodingErrorAction.REPORT).
061               onUnmappableCharacter(CodingErrorAction.REPORT);
062    }
063  };
064  
065  private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
066    new ThreadLocal<CharsetDecoder>() {
067    protected CharsetDecoder initialValue() {
068      return Charset.forName("UTF-8").newDecoder().
069             onMalformedInput(CodingErrorAction.REPORT).
070             onUnmappableCharacter(CodingErrorAction.REPORT);
071    }
072  };
073  
074  private static final byte [] EMPTY_BYTES = new byte[0];
075  
076  private byte[] bytes;
077  private int length;
078
079  public Text() {
080    bytes = EMPTY_BYTES;
081  }
082
083  /** Construct from a string. 
084   */
085  public Text(String string) {
086    set(string);
087  }
088
089  /** Construct from another text. */
090  public Text(Text utf8) {
091    set(utf8);
092  }
093
094  /** Construct from a byte array.
095   */
096  public Text(byte[] utf8)  {
097    set(utf8);
098  }
099  
100  /**
101   * Get a copy of the bytes that is exactly the length of the data.
102   * See {@link #getBytes()} for faster access to the underlying array.
103   */
104  public byte[] copyBytes() {
105    byte[] result = new byte[length];
106    System.arraycopy(bytes, 0, result, 0, length);
107    return result;
108  }
109  
110  /**
111   * Returns the raw bytes; however, only data up to {@link #getLength()} is
112   * valid. Please use {@link #copyBytes()} if you
113   * need the returned array to be precisely the length of the data.
114   */
115  public byte[] getBytes() {
116    return bytes;
117  }
118
119  /** Returns the number of bytes in the byte array */ 
120  public int getLength() {
121    return length;
122  }
123  
124  /**
125   * Returns the Unicode Scalar Value (32-bit integer value)
126   * for the character at <code>position</code>. Note that this
127   * method avoids using the converter or doing String instatiation
128   * @return the Unicode scalar value at position or -1
129   *          if the position is invalid or points to a
130   *          trailing byte
131   */
132  public int charAt(int position) {
133    if (position > this.length) return -1; // too long
134    if (position < 0) return -1; // duh.
135      
136    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
137    return bytesToCodePoint(bb.slice());
138  }
139  
140  public int find(String what) {
141    return find(what, 0);
142  }
143  
144  /**
145   * Finds any occurence of <code>what</code> in the backing
146   * buffer, starting as position <code>start</code>. The starting
147   * position is measured in bytes and the return value is in
148   * terms of byte position in the buffer. The backing buffer is
149   * not converted to a string for this operation.
150   * @return byte position of the first occurence of the search
151   *         string in the UTF-8 buffer or -1 if not found
152   */
153  public int find(String what, int start) {
154    try {
155      ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
156      ByteBuffer tgt = encode(what);
157      byte b = tgt.get();
158      src.position(start);
159          
160      while (src.hasRemaining()) {
161        if (b == src.get()) { // matching first byte
162          src.mark(); // save position in loop
163          tgt.mark(); // save position in target
164          boolean found = true;
165          int pos = src.position()-1;
166          while (tgt.hasRemaining()) {
167            if (!src.hasRemaining()) { // src expired first
168              tgt.reset();
169              src.reset();
170              found = false;
171              break;
172            }
173            if (!(tgt.get() == src.get())) {
174              tgt.reset();
175              src.reset();
176              found = false;
177              break; // no match
178            }
179          }
180          if (found) return pos;
181        }
182      }
183      return -1; // not found
184    } catch (CharacterCodingException e) {
185      // can't get here
186      e.printStackTrace();
187      return -1;
188    }
189  }  
190  /** Set to contain the contents of a string. 
191   */
192  public void set(String string) {
193    try {
194      ByteBuffer bb = encode(string, true);
195      bytes = bb.array();
196      length = bb.limit();
197    }catch(CharacterCodingException e) {
198      throw new RuntimeException("Should not have happened ", e); 
199    }
200  }
201
202  /** Set to a utf8 byte array
203   */
204  public void set(byte[] utf8) {
205    set(utf8, 0, utf8.length);
206  }
207  
208  /** copy a text. */
209  public void set(Text other) {
210    set(other.getBytes(), 0, other.getLength());
211  }
212
213  /**
214   * Set the Text to range of bytes
215   * @param utf8 the data to copy from
216   * @param start the first position of the new string
217   * @param len the number of bytes of the new string
218   */
219  public void set(byte[] utf8, int start, int len) {
220    setCapacity(len, false);
221    System.arraycopy(utf8, start, bytes, 0, len);
222    this.length = len;
223  }
224
225  /**
226   * Append a range of bytes to the end of the given text
227   * @param utf8 the data to copy from
228   * @param start the first position to append from utf8
229   * @param len the number of bytes to append
230   */
231  public void append(byte[] utf8, int start, int len) {
232    setCapacity(length + len, true);
233    System.arraycopy(utf8, start, bytes, length, len);
234    length += len;
235  }
236
237  /**
238   * Clear the string to empty.
239   */
240  public void clear() {
241    length = 0;
242  }
243
244  /*
245   * Sets the capacity of this Text object to <em>at least</em>
246   * <code>len</code> bytes. If the current buffer is longer,
247   * then the capacity and existing content of the buffer are
248   * unchanged. If <code>len</code> is larger
249   * than the current capacity, the Text object's capacity is
250   * increased to match.
251   * @param len the number of bytes we need
252   * @param keepData should the old data be kept
253   */
254  private void setCapacity(int len, boolean keepData) {
255    if (bytes == null || bytes.length < len) {
256      if (bytes != null && keepData) {
257        bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
258      } else {
259        bytes = new byte[len];
260      }
261    }
262  }
263   
264  /** 
265   * Convert text back to string
266   * @see java.lang.Object#toString()
267   */
268  @Override
269  public String toString() {
270    try {
271      return decode(bytes, 0, length);
272    } catch (CharacterCodingException e) { 
273      throw new RuntimeException("Should not have happened " , e); 
274    }
275  }
276  
277  /** deserialize 
278   */
279  public void readFields(DataInput in) throws IOException {
280    int newLength = WritableUtils.readVInt(in);
281    setCapacity(newLength, false);
282    in.readFully(bytes, 0, newLength);
283    length = newLength;
284  }
285
286  /** Skips over one Text in the input. */
287  public static void skip(DataInput in) throws IOException {
288    int length = WritableUtils.readVInt(in);
289    WritableUtils.skipFully(in, length);
290  }
291
292  /** serialize
293   * write this object to out
294   * length uses zero-compressed encoding
295   * @see Writable#write(DataOutput)
296   */
297  public void write(DataOutput out) throws IOException {
298    WritableUtils.writeVInt(out, length);
299    out.write(bytes, 0, length);
300  }
301
302  /** Returns true iff <code>o</code> is a Text with the same contents.  */
303  public boolean equals(Object o) {
304    if (o instanceof Text)
305      return super.equals(o);
306    return false;
307  }
308
309  @Override
310  public int hashCode() {
311    return super.hashCode();
312  }
313
314  /** A WritableComparator optimized for Text keys. */
315  public static class Comparator extends WritableComparator {
316    public Comparator() {
317      super(Text.class);
318    }
319
320    public int compare(byte[] b1, int s1, int l1,
321                       byte[] b2, int s2, int l2) {
322      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
323      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
324      return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
325    }
326  }
327
328  static {
329    // register this comparator
330    WritableComparator.define(Text.class, new Comparator());
331  }
332
333  /// STATIC UTILITIES FROM HERE DOWN
334  /**
335   * Converts the provided byte array to a String using the
336   * UTF-8 encoding. If the input is malformed,
337   * replace by a default value.
338   */
339  public static String decode(byte[] utf8) throws CharacterCodingException {
340    return decode(ByteBuffer.wrap(utf8), true);
341  }
342  
343  public static String decode(byte[] utf8, int start, int length) 
344    throws CharacterCodingException {
345    return decode(ByteBuffer.wrap(utf8, start, length), true);
346  }
347  
348  /**
349   * Converts the provided byte array to a String using the
350   * UTF-8 encoding. If <code>replace</code> is true, then
351   * malformed input is replaced with the
352   * substitution character, which is U+FFFD. Otherwise the
353   * method throws a MalformedInputException.
354   */
355  public static String decode(byte[] utf8, int start, int length, boolean replace) 
356    throws CharacterCodingException {
357    return decode(ByteBuffer.wrap(utf8, start, length), replace);
358  }
359  
360  private static String decode(ByteBuffer utf8, boolean replace) 
361    throws CharacterCodingException {
362    CharsetDecoder decoder = DECODER_FACTORY.get();
363    if (replace) {
364      decoder.onMalformedInput(
365          java.nio.charset.CodingErrorAction.REPLACE);
366      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
367    }
368    String str = decoder.decode(utf8).toString();
369    // set decoder back to its default value: REPORT
370    if (replace) {
371      decoder.onMalformedInput(CodingErrorAction.REPORT);
372      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
373    }
374    return str;
375  }
376
377  /**
378   * Converts the provided String to bytes using the
379   * UTF-8 encoding. If the input is malformed,
380   * invalid chars are replaced by a default value.
381   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
382   *                     and length is ByteBuffer.limit()
383   */
384
385  public static ByteBuffer encode(String string)
386    throws CharacterCodingException {
387    return encode(string, true);
388  }
389
390  /**
391   * Converts the provided String to bytes using the
392   * UTF-8 encoding. If <code>replace</code> is true, then
393   * malformed input is replaced with the
394   * substitution character, which is U+FFFD. Otherwise the
395   * method throws a MalformedInputException.
396   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
397   *                     and length is ByteBuffer.limit()
398   */
399  public static ByteBuffer encode(String string, boolean replace)
400    throws CharacterCodingException {
401    CharsetEncoder encoder = ENCODER_FACTORY.get();
402    if (replace) {
403      encoder.onMalformedInput(CodingErrorAction.REPLACE);
404      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
405    }
406    ByteBuffer bytes = 
407      encoder.encode(CharBuffer.wrap(string.toCharArray()));
408    if (replace) {
409      encoder.onMalformedInput(CodingErrorAction.REPORT);
410      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
411    }
412    return bytes;
413  }
414
415  /** Read a UTF8 encoded string from in
416   */
417  public static String readString(DataInput in) throws IOException {
418    int length = WritableUtils.readVInt(in);
419    byte [] bytes = new byte[length];
420    in.readFully(bytes, 0, length);
421    return decode(bytes);
422  }
423
424  /** Write a UTF8 encoded string to out
425   */
426  public static int writeString(DataOutput out, String s) throws IOException {
427    ByteBuffer bytes = encode(s);
428    int length = bytes.limit();
429    WritableUtils.writeVInt(out, length);
430    out.write(bytes.array(), 0, length);
431    return length;
432  }
433
434  ////// states for validateUTF8
435  
436  private static final int LEAD_BYTE = 0;
437
438  private static final int TRAIL_BYTE_1 = 1;
439
440  private static final int TRAIL_BYTE = 2;
441
442  /** 
443   * Check if a byte array contains valid utf-8
444   * @param utf8 byte array
445   * @throws MalformedInputException if the byte array contains invalid utf-8
446   */
447  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
448    validateUTF8(utf8, 0, utf8.length);     
449  }
450  
451  /**
452   * Check to see if a byte array is valid utf-8
453   * @param utf8 the array of bytes
454   * @param start the offset of the first byte in the array
455   * @param len the length of the byte sequence
456   * @throws MalformedInputException if the byte array contains invalid bytes
457   */
458  public static void validateUTF8(byte[] utf8, int start, int len)
459    throws MalformedInputException {
460    int count = start;
461    int leadByte = 0;
462    int length = 0;
463    int state = LEAD_BYTE;
464    while (count < start+len) {
465      int aByte = ((int) utf8[count] & 0xFF);
466
467      switch (state) {
468      case LEAD_BYTE:
469        leadByte = aByte;
470        length = bytesFromUTF8[aByte];
471
472        switch (length) {
473        case 0: // check for ASCII
474          if (leadByte > 0x7F)
475            throw new MalformedInputException(count);
476          break;
477        case 1:
478          if (leadByte < 0xC2 || leadByte > 0xDF)
479            throw new MalformedInputException(count);
480          state = TRAIL_BYTE_1;
481          break;
482        case 2:
483          if (leadByte < 0xE0 || leadByte > 0xEF)
484            throw new MalformedInputException(count);
485          state = TRAIL_BYTE_1;
486          break;
487        case 3:
488          if (leadByte < 0xF0 || leadByte > 0xF4)
489            throw new MalformedInputException(count);
490          state = TRAIL_BYTE_1;
491          break;
492        default:
493          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
494          // or if < 0 we got a trail byte in the lead byte position
495          throw new MalformedInputException(count);
496        } // switch (length)
497        break;
498
499      case TRAIL_BYTE_1:
500        if (leadByte == 0xF0 && aByte < 0x90)
501          throw new MalformedInputException(count);
502        if (leadByte == 0xF4 && aByte > 0x8F)
503          throw new MalformedInputException(count);
504        if (leadByte == 0xE0 && aByte < 0xA0)
505          throw new MalformedInputException(count);
506        if (leadByte == 0xED && aByte > 0x9F)
507          throw new MalformedInputException(count);
508        // falls through to regular trail-byte test!!
509      case TRAIL_BYTE:
510        if (aByte < 0x80 || aByte > 0xBF)
511          throw new MalformedInputException(count);
512        if (--length == 0) {
513          state = LEAD_BYTE;
514        } else {
515          state = TRAIL_BYTE;
516        }
517        break;
518      } // switch (state)
519      count++;
520    }
521  }
522
523  /**
524   * Magic numbers for UTF-8. These are the number of bytes
525   * that <em>follow</em> a given lead byte. Trailing bytes
526   * have the value -1. The values 4 and 5 are presented in
527   * this table, even though valid UTF-8 cannot include the
528   * five and six byte sequences.
529   */
530  static final int[] bytesFromUTF8 =
531  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
532    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
533    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
534    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
535    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
536    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
537    0, 0, 0, 0, 0, 0, 0,
538    // trail bytes
539    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
540    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
541    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
542    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
543    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
544    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
545    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
546
547  /**
548   * Returns the next code point at the current position in
549   * the buffer. The buffer's position will be incremented.
550   * Any mark set on this buffer will be changed by this method!
551   */
552  public static int bytesToCodePoint(ByteBuffer bytes) {
553    bytes.mark();
554    byte b = bytes.get();
555    bytes.reset();
556    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
557    if (extraBytesToRead < 0) return -1; // trailing byte!
558    int ch = 0;
559
560    switch (extraBytesToRead) {
561    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
562    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
563    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
564    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
565    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
566    case 0: ch += (bytes.get() & 0xFF);
567    }
568    ch -= offsetsFromUTF8[extraBytesToRead];
569
570    return ch;
571  }
572
573  
574  static final int offsetsFromUTF8[] =
575  { 0x00000000, 0x00003080,
576    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
577
578  /**
579   * For the given string, returns the number of UTF-8 bytes
580   * required to encode the string.
581   * @param string text to encode
582   * @return number of UTF-8 bytes required to encode
583   */
584  public static int utf8Length(String string) {
585    CharacterIterator iter = new StringCharacterIterator(string);
586    char ch = iter.first();
587    int size = 0;
588    while (ch != CharacterIterator.DONE) {
589      if ((ch >= 0xD800) && (ch < 0xDC00)) {
590        // surrogate pair?
591        char trail = iter.next();
592        if ((trail > 0xDBFF) && (trail < 0xE000)) {
593          // valid pair
594          size += 4;
595        } else {
596          // invalid pair
597          size += 3;
598          iter.previous(); // rewind one
599        }
600      } else if (ch < 0x80) {
601        size++;
602      } else if (ch < 0x800) {
603        size += 2;
604      } else {
605        // ch < 0x10000, that is, the largest char value
606        size += 3;
607      }
608      ch = iter.next();
609    }
610    return size;
611  }
612}