001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.io;
020
021import java.io.IOException;
022import java.io.DataInput;
023import java.io.DataOutput;
024import java.nio.ByteBuffer;
025import java.nio.CharBuffer;
026import java.nio.charset.CharacterCodingException;
027import java.nio.charset.Charset;
028import java.nio.charset.CharsetDecoder;
029import java.nio.charset.CharsetEncoder;
030import java.nio.charset.CodingErrorAction;
031import java.nio.charset.MalformedInputException;
032import java.text.CharacterIterator;
033import java.text.StringCharacterIterator;
034import java.util.Arrays;
035
036import org.apache.avro.reflect.Stringable;
037
038import org.apache.hadoop.classification.InterfaceAudience;
039import org.apache.hadoop.classification.InterfaceStability;
040
041/** This class stores text using standard UTF8 encoding.  It provides methods
042 * to serialize, deserialize, and compare texts at byte level.  The type of
043 * length is integer and is serialized using zero-compressed format.  <p>In
044 * addition, it provides methods for string traversal without converting the
045 * byte array to a string.  <p>Also includes utilities for
046 * serializing/deserialing a string, coding/decoding a string, checking if a
047 * byte array contains valid UTF8 code, calculating the length of an encoded
048 * string.
049 */
050@Stringable
051@InterfaceAudience.Public
052@InterfaceStability.Stable
053public class Text extends BinaryComparable
054    implements WritableComparable<BinaryComparable> {
055  
056  private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057    new ThreadLocal<CharsetEncoder>() {
058      @Override
059      protected CharsetEncoder initialValue() {
060        return Charset.forName("UTF-8").newEncoder().
061               onMalformedInput(CodingErrorAction.REPORT).
062               onUnmappableCharacter(CodingErrorAction.REPORT);
063    }
064  };
065  
066  private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
067    new ThreadLocal<CharsetDecoder>() {
068    @Override
069    protected CharsetDecoder initialValue() {
070      return Charset.forName("UTF-8").newDecoder().
071             onMalformedInput(CodingErrorAction.REPORT).
072             onUnmappableCharacter(CodingErrorAction.REPORT);
073    }
074  };
075  
076  private static final byte [] EMPTY_BYTES = new byte[0];
077  
078  private byte[] bytes;
079  private int length;
080
081  public Text() {
082    bytes = EMPTY_BYTES;
083  }
084
085  /** Construct from a string. 
086   */
087  public Text(String string) {
088    set(string);
089  }
090
091  /** Construct from another text. */
092  public Text(Text utf8) {
093    set(utf8);
094  }
095
096  /** Construct from a byte array.
097   */
098  public Text(byte[] utf8)  {
099    set(utf8);
100  }
101  
102  /**
103   * Get a copy of the bytes that is exactly the length of the data.
104   * See {@link #getBytes()} for faster access to the underlying array.
105   */
106  public byte[] copyBytes() {
107    byte[] result = new byte[length];
108    System.arraycopy(bytes, 0, result, 0, length);
109    return result;
110  }
111  
112  /**
113   * Returns the raw bytes; however, only data up to {@link #getLength()} is
114   * valid. Please use {@link #copyBytes()} if you
115   * need the returned array to be precisely the length of the data.
116   */
117  @Override
118  public byte[] getBytes() {
119    return bytes;
120  }
121
122  /** Returns the number of bytes in the byte array */ 
123  @Override
124  public int getLength() {
125    return length;
126  }
127  
128  /**
129   * Returns the Unicode Scalar Value (32-bit integer value)
130   * for the character at <code>position</code>. Note that this
131   * method avoids using the converter or doing String instantiation
132   * @return the Unicode scalar value at position or -1
133   *          if the position is invalid or points to a
134   *          trailing byte
135   */
136  public int charAt(int position) {
137    if (position > this.length) return -1; // too long
138    if (position < 0) return -1; // duh.
139      
140    ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
141    return bytesToCodePoint(bb.slice());
142  }
143  
144  public int find(String what) {
145    return find(what, 0);
146  }
147  
148  /**
149   * Finds any occurence of <code>what</code> in the backing
150   * buffer, starting as position <code>start</code>. The starting
151   * position is measured in bytes and the return value is in
152   * terms of byte position in the buffer. The backing buffer is
153   * not converted to a string for this operation.
154   * @return byte position of the first occurence of the search
155   *         string in the UTF-8 buffer or -1 if not found
156   */
157  public int find(String what, int start) {
158    try {
159      ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
160      ByteBuffer tgt = encode(what);
161      byte b = tgt.get();
162      src.position(start);
163          
164      while (src.hasRemaining()) {
165        if (b == src.get()) { // matching first byte
166          src.mark(); // save position in loop
167          tgt.mark(); // save position in target
168          boolean found = true;
169          int pos = src.position()-1;
170          while (tgt.hasRemaining()) {
171            if (!src.hasRemaining()) { // src expired first
172              tgt.reset();
173              src.reset();
174              found = false;
175              break;
176            }
177            if (!(tgt.get() == src.get())) {
178              tgt.reset();
179              src.reset();
180              found = false;
181              break; // no match
182            }
183          }
184          if (found) return pos;
185        }
186      }
187      return -1; // not found
188    } catch (CharacterCodingException e) {
189      // can't get here
190      e.printStackTrace();
191      return -1;
192    }
193  }  
194  /** Set to contain the contents of a string. 
195   */
196  public void set(String string) {
197    try {
198      ByteBuffer bb = encode(string, true);
199      bytes = bb.array();
200      length = bb.limit();
201    }catch(CharacterCodingException e) {
202      throw new RuntimeException("Should not have happened ", e); 
203    }
204  }
205
206  /** Set to a utf8 byte array
207   */
208  public void set(byte[] utf8) {
209    set(utf8, 0, utf8.length);
210  }
211  
212  /** copy a text. */
213  public void set(Text other) {
214    set(other.getBytes(), 0, other.getLength());
215  }
216
217  /**
218   * Set the Text to range of bytes
219   * @param utf8 the data to copy from
220   * @param start the first position of the new string
221   * @param len the number of bytes of the new string
222   */
223  public void set(byte[] utf8, int start, int len) {
224    setCapacity(len, false);
225    System.arraycopy(utf8, start, bytes, 0, len);
226    this.length = len;
227  }
228
229  /**
230   * Append a range of bytes to the end of the given text
231   * @param utf8 the data to copy from
232   * @param start the first position to append from utf8
233   * @param len the number of bytes to append
234   */
235  public void append(byte[] utf8, int start, int len) {
236    setCapacity(length + len, true);
237    System.arraycopy(utf8, start, bytes, length, len);
238    length += len;
239  }
240
241  /**
242   * Clear the string to empty.
243   *
244   * <em>Note</em>: For performance reasons, this call does not clear the
245   * underlying byte array that is retrievable via {@link #getBytes()}.
246   * In order to free the byte-array memory, call {@link #set(byte[])}
247   * with an empty byte array (For example, <code>new byte[0]</code>).
248   */
249  public void clear() {
250    length = 0;
251  }
252
253  /*
254   * Sets the capacity of this Text object to <em>at least</em>
255   * <code>len</code> bytes. If the current buffer is longer,
256   * then the capacity and existing content of the buffer are
257   * unchanged. If <code>len</code> is larger
258   * than the current capacity, the Text object's capacity is
259   * increased to match.
260   * @param len the number of bytes we need
261   * @param keepData should the old data be kept
262   */
263  private void setCapacity(int len, boolean keepData) {
264    if (bytes == null || bytes.length < len) {
265      if (bytes != null && keepData) {
266        bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
267      } else {
268        bytes = new byte[len];
269      }
270    }
271  }
272   
273  /** 
274   * Convert text back to string
275   * @see java.lang.Object#toString()
276   */
277  @Override
278  public String toString() {
279    try {
280      return decode(bytes, 0, length);
281    } catch (CharacterCodingException e) { 
282      throw new RuntimeException("Should not have happened " , e); 
283    }
284  }
285  
286  /** deserialize 
287   */
288  @Override
289  public void readFields(DataInput in) throws IOException {
290    int newLength = WritableUtils.readVInt(in);
291    readWithKnownLength(in, newLength);
292  }
293  
294  public void readFields(DataInput in, int maxLength) throws IOException {
295    int newLength = WritableUtils.readVInt(in);
296    if (newLength < 0) {
297      throw new IOException("tried to deserialize " + newLength +
298          " bytes of data!  newLength must be non-negative.");
299    } else if (newLength >= maxLength) {
300      throw new IOException("tried to deserialize " + newLength +
301          " bytes of data, but maxLength = " + maxLength);
302    }
303    readWithKnownLength(in, newLength);
304  }
305
306  /** Skips over one Text in the input. */
307  public static void skip(DataInput in) throws IOException {
308    int length = WritableUtils.readVInt(in);
309    WritableUtils.skipFully(in, length);
310  }
311
312  /**
313   * Read a Text object whose length is already known.
314   * This allows creating Text from a stream which uses a different serialization
315   * format.
316   */
317  public void readWithKnownLength(DataInput in, int len) throws IOException {
318    setCapacity(len, false);
319    in.readFully(bytes, 0, len);
320    length = len;
321  }
322
323  /** serialize
324   * write this object to out
325   * length uses zero-compressed encoding
326   * @see Writable#write(DataOutput)
327   */
328  @Override
329  public void write(DataOutput out) throws IOException {
330    WritableUtils.writeVInt(out, length);
331    out.write(bytes, 0, length);
332  }
333
334  public void write(DataOutput out, int maxLength) throws IOException {
335    if (length > maxLength) {
336      throw new IOException("data was too long to write!  Expected " +
337          "less than or equal to " + maxLength + " bytes, but got " +
338          length + " bytes.");
339    }
340    WritableUtils.writeVInt(out, length);
341    out.write(bytes, 0, length);
342  }
343
344  /** Returns true iff <code>o</code> is a Text with the same contents.  */
345  @Override
346  public boolean equals(Object o) {
347    if (o instanceof Text)
348      return super.equals(o);
349    return false;
350  }
351
352  @Override
353  public int hashCode() {
354    return super.hashCode();
355  }
356
357  /** A WritableComparator optimized for Text keys. */
358  public static class Comparator extends WritableComparator {
359    public Comparator() {
360      super(Text.class);
361    }
362
363    @Override
364    public int compare(byte[] b1, int s1, int l1,
365                       byte[] b2, int s2, int l2) {
366      int n1 = WritableUtils.decodeVIntSize(b1[s1]);
367      int n2 = WritableUtils.decodeVIntSize(b2[s2]);
368      return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
369    }
370  }
371
372  static {
373    // register this comparator
374    WritableComparator.define(Text.class, new Comparator());
375  }
376
377  /// STATIC UTILITIES FROM HERE DOWN
378  /**
379   * Converts the provided byte array to a String using the
380   * UTF-8 encoding. If the input is malformed,
381   * replace by a default value.
382   */
383  public static String decode(byte[] utf8) throws CharacterCodingException {
384    return decode(ByteBuffer.wrap(utf8), true);
385  }
386  
387  public static String decode(byte[] utf8, int start, int length) 
388    throws CharacterCodingException {
389    return decode(ByteBuffer.wrap(utf8, start, length), true);
390  }
391  
392  /**
393   * Converts the provided byte array to a String using the
394   * UTF-8 encoding. If <code>replace</code> is true, then
395   * malformed input is replaced with the
396   * substitution character, which is U+FFFD. Otherwise the
397   * method throws a MalformedInputException.
398   */
399  public static String decode(byte[] utf8, int start, int length, boolean replace) 
400    throws CharacterCodingException {
401    return decode(ByteBuffer.wrap(utf8, start, length), replace);
402  }
403  
404  private static String decode(ByteBuffer utf8, boolean replace) 
405    throws CharacterCodingException {
406    CharsetDecoder decoder = DECODER_FACTORY.get();
407    if (replace) {
408      decoder.onMalformedInput(
409          java.nio.charset.CodingErrorAction.REPLACE);
410      decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
411    }
412    String str = decoder.decode(utf8).toString();
413    // set decoder back to its default value: REPORT
414    if (replace) {
415      decoder.onMalformedInput(CodingErrorAction.REPORT);
416      decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
417    }
418    return str;
419  }
420
421  /**
422   * Converts the provided String to bytes using the
423   * UTF-8 encoding. If the input is malformed,
424   * invalid chars are replaced by a default value.
425   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
426   *                     and length is ByteBuffer.limit()
427   */
428
429  public static ByteBuffer encode(String string)
430    throws CharacterCodingException {
431    return encode(string, true);
432  }
433
434  /**
435   * Converts the provided String to bytes using the
436   * UTF-8 encoding. If <code>replace</code> is true, then
437   * malformed input is replaced with the
438   * substitution character, which is U+FFFD. Otherwise the
439   * method throws a MalformedInputException.
440   * @return ByteBuffer: bytes stores at ByteBuffer.array() 
441   *                     and length is ByteBuffer.limit()
442   */
443  public static ByteBuffer encode(String string, boolean replace)
444    throws CharacterCodingException {
445    CharsetEncoder encoder = ENCODER_FACTORY.get();
446    if (replace) {
447      encoder.onMalformedInput(CodingErrorAction.REPLACE);
448      encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
449    }
450    ByteBuffer bytes = 
451      encoder.encode(CharBuffer.wrap(string.toCharArray()));
452    if (replace) {
453      encoder.onMalformedInput(CodingErrorAction.REPORT);
454      encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
455    }
456    return bytes;
457  }
458
459  static final public int DEFAULT_MAX_LEN = 1024 * 1024;
460
461  /** Read a UTF8 encoded string from in
462   */
463  public static String readString(DataInput in) throws IOException {
464    return readString(in, Integer.MAX_VALUE);
465  }
466  
467  /** Read a UTF8 encoded string with a maximum size
468   */
469  public static String readString(DataInput in, int maxLength)
470      throws IOException {
471    int length = WritableUtils.readVIntInRange(in, 0, maxLength);
472    byte [] bytes = new byte[length];
473    in.readFully(bytes, 0, length);
474    return decode(bytes);
475  }
476  
477  /** Write a UTF8 encoded string to out
478   */
479  public static int writeString(DataOutput out, String s) throws IOException {
480    ByteBuffer bytes = encode(s);
481    int length = bytes.limit();
482    WritableUtils.writeVInt(out, length);
483    out.write(bytes.array(), 0, length);
484    return length;
485  }
486
487  /** Write a UTF8 encoded string with a maximum size to out
488   */
489  public static int writeString(DataOutput out, String s, int maxLength)
490      throws IOException {
491    ByteBuffer bytes = encode(s);
492    int length = bytes.limit();
493    if (length > maxLength) {
494      throw new IOException("string was too long to write!  Expected " +
495          "less than or equal to " + maxLength + " bytes, but got " +
496          length + " bytes.");
497    }
498    WritableUtils.writeVInt(out, length);
499    out.write(bytes.array(), 0, length);
500    return length;
501  }
502
503  ////// states for validateUTF8
504  
505  private static final int LEAD_BYTE = 0;
506
507  private static final int TRAIL_BYTE_1 = 1;
508
509  private static final int TRAIL_BYTE = 2;
510
511  /** 
512   * Check if a byte array contains valid utf-8
513   * @param utf8 byte array
514   * @throws MalformedInputException if the byte array contains invalid utf-8
515   */
516  public static void validateUTF8(byte[] utf8) throws MalformedInputException {
517    validateUTF8(utf8, 0, utf8.length);     
518  }
519  
520  /**
521   * Check to see if a byte array is valid utf-8
522   * @param utf8 the array of bytes
523   * @param start the offset of the first byte in the array
524   * @param len the length of the byte sequence
525   * @throws MalformedInputException if the byte array contains invalid bytes
526   */
527  public static void validateUTF8(byte[] utf8, int start, int len)
528    throws MalformedInputException {
529    int count = start;
530    int leadByte = 0;
531    int length = 0;
532    int state = LEAD_BYTE;
533    while (count < start+len) {
534      int aByte = utf8[count] & 0xFF;
535
536      switch (state) {
537      case LEAD_BYTE:
538        leadByte = aByte;
539        length = bytesFromUTF8[aByte];
540
541        switch (length) {
542        case 0: // check for ASCII
543          if (leadByte > 0x7F)
544            throw new MalformedInputException(count);
545          break;
546        case 1:
547          if (leadByte < 0xC2 || leadByte > 0xDF)
548            throw new MalformedInputException(count);
549          state = TRAIL_BYTE_1;
550          break;
551        case 2:
552          if (leadByte < 0xE0 || leadByte > 0xEF)
553            throw new MalformedInputException(count);
554          state = TRAIL_BYTE_1;
555          break;
556        case 3:
557          if (leadByte < 0xF0 || leadByte > 0xF4)
558            throw new MalformedInputException(count);
559          state = TRAIL_BYTE_1;
560          break;
561        default:
562          // too long! Longest valid UTF-8 is 4 bytes (lead + three)
563          // or if < 0 we got a trail byte in the lead byte position
564          throw new MalformedInputException(count);
565        } // switch (length)
566        break;
567
568      case TRAIL_BYTE_1:
569        if (leadByte == 0xF0 && aByte < 0x90)
570          throw new MalformedInputException(count);
571        if (leadByte == 0xF4 && aByte > 0x8F)
572          throw new MalformedInputException(count);
573        if (leadByte == 0xE0 && aByte < 0xA0)
574          throw new MalformedInputException(count);
575        if (leadByte == 0xED && aByte > 0x9F)
576          throw new MalformedInputException(count);
577        // falls through to regular trail-byte test!!
578      case TRAIL_BYTE:
579        if (aByte < 0x80 || aByte > 0xBF)
580          throw new MalformedInputException(count);
581        if (--length == 0) {
582          state = LEAD_BYTE;
583        } else {
584          state = TRAIL_BYTE;
585        }
586        break;
587      } // switch (state)
588      count++;
589    }
590  }
591
592  /**
593   * Magic numbers for UTF-8. These are the number of bytes
594   * that <em>follow</em> a given lead byte. Trailing bytes
595   * have the value -1. The values 4 and 5 are presented in
596   * this table, even though valid UTF-8 cannot include the
597   * five and six byte sequences.
598   */
599  static final int[] bytesFromUTF8 =
600  { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
601    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
602    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
603    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
604    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
605    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
606    0, 0, 0, 0, 0, 0, 0,
607    // trail bytes
608    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
609    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
610    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
611    -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
612    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
613    1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
614    3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
615
616  /**
617   * Returns the next code point at the current position in
618   * the buffer. The buffer's position will be incremented.
619   * Any mark set on this buffer will be changed by this method!
620   */
621  public static int bytesToCodePoint(ByteBuffer bytes) {
622    bytes.mark();
623    byte b = bytes.get();
624    bytes.reset();
625    int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
626    if (extraBytesToRead < 0) return -1; // trailing byte!
627    int ch = 0;
628
629    switch (extraBytesToRead) {
630    case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
631    case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
632    case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
633    case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
634    case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
635    case 0: ch += (bytes.get() & 0xFF);
636    }
637    ch -= offsetsFromUTF8[extraBytesToRead];
638
639    return ch;
640  }
641
642  
643  static final int offsetsFromUTF8[] =
644  { 0x00000000, 0x00003080,
645    0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
646
647  /**
648   * For the given string, returns the number of UTF-8 bytes
649   * required to encode the string.
650   * @param string text to encode
651   * @return number of UTF-8 bytes required to encode
652   */
653  public static int utf8Length(String string) {
654    CharacterIterator iter = new StringCharacterIterator(string);
655    char ch = iter.first();
656    int size = 0;
657    while (ch != CharacterIterator.DONE) {
658      if ((ch >= 0xD800) && (ch < 0xDC00)) {
659        // surrogate pair?
660        char trail = iter.next();
661        if ((trail > 0xDBFF) && (trail < 0xE000)) {
662          // valid pair
663          size += 4;
664        } else {
665          // invalid pair
666          size += 3;
667          iter.previous(); // rewind one
668        }
669      } else if (ch < 0x80) {
670        size++;
671      } else if (ch < 0x800) {
672        size += 2;
673      } else {
674        // ch < 0x10000, that is, the largest char value
675        size += 3;
676      }
677      ch = iter.next();
678    }
679    return size;
680  }
681}