001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.classification.InterfaceStability;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    public class Text extends BinaryComparable
054        implements WritableComparable<BinaryComparable> {
055      
056      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057        new ThreadLocal<CharsetEncoder>() {
058          @Override
059          protected CharsetEncoder initialValue() {
060            return Charset.forName("UTF-8").newEncoder().
061                   onMalformedInput(CodingErrorAction.REPORT).
062                   onUnmappableCharacter(CodingErrorAction.REPORT);
063        }
064      };
065      
066      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
067        new ThreadLocal<CharsetDecoder>() {
068        @Override
069        protected CharsetDecoder initialValue() {
070          return Charset.forName("UTF-8").newDecoder().
071                 onMalformedInput(CodingErrorAction.REPORT).
072                 onUnmappableCharacter(CodingErrorAction.REPORT);
073        }
074      };
075      
076      private static final byte [] EMPTY_BYTES = new byte[0];
077      
078      private byte[] bytes;
079      private int length;
080    
081      public Text() {
082        bytes = EMPTY_BYTES;
083      }
084    
085      /** Construct from a string. 
086       */
087      public Text(String string) {
088        set(string);
089      }
090    
091      /** Construct from another text. */
092      public Text(Text utf8) {
093        set(utf8);
094      }
095    
096      /** Construct from a byte array.
097       */
098      public Text(byte[] utf8)  {
099        set(utf8);
100      }
101      
102      /**
103       * Get a copy of the bytes that is exactly the length of the data.
104       * See {@link #getBytes()} for faster access to the underlying array.
105       */
106      public byte[] copyBytes() {
107        byte[] result = new byte[length];
108        System.arraycopy(bytes, 0, result, 0, length);
109        return result;
110      }
111      
112      /**
113       * Returns the raw bytes; however, only data up to {@link #getLength()} is
114       * valid. Please use {@link #copyBytes()} if you
115       * need the returned array to be precisely the length of the data.
116       */
117      @Override
118      public byte[] getBytes() {
119        return bytes;
120      }
121    
122      /** Returns the number of bytes in the byte array */ 
123      @Override
124      public int getLength() {
125        return length;
126      }
127      
128      /**
129       * Returns the Unicode Scalar Value (32-bit integer value)
130       * for the character at <code>position</code>. Note that this
131       * method avoids using the converter or doing String instantiation
132       * @return the Unicode scalar value at position or -1
133       *          if the position is invalid or points to a
134       *          trailing byte
135       */
136      public int charAt(int position) {
137        if (position > this.length) return -1; // too long
138        if (position < 0) return -1; // duh.
139          
140        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
141        return bytesToCodePoint(bb.slice());
142      }
143      
144      public int find(String what) {
145        return find(what, 0);
146      }
147      
148      /**
149       * Finds any occurence of <code>what</code> in the backing
150       * buffer, starting as position <code>start</code>. The starting
151       * position is measured in bytes and the return value is in
152       * terms of byte position in the buffer. The backing buffer is
153       * not converted to a string for this operation.
154       * @return byte position of the first occurence of the search
155       *         string in the UTF-8 buffer or -1 if not found
156       */
157      public int find(String what, int start) {
158        try {
159          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
160          ByteBuffer tgt = encode(what);
161          byte b = tgt.get();
162          src.position(start);
163              
164          while (src.hasRemaining()) {
165            if (b == src.get()) { // matching first byte
166              src.mark(); // save position in loop
167              tgt.mark(); // save position in target
168              boolean found = true;
169              int pos = src.position()-1;
170              while (tgt.hasRemaining()) {
171                if (!src.hasRemaining()) { // src expired first
172                  tgt.reset();
173                  src.reset();
174                  found = false;
175                  break;
176                }
177                if (!(tgt.get() == src.get())) {
178                  tgt.reset();
179                  src.reset();
180                  found = false;
181                  break; // no match
182                }
183              }
184              if (found) return pos;
185            }
186          }
187          return -1; // not found
188        } catch (CharacterCodingException e) {
189          // can't get here
190          e.printStackTrace();
191          return -1;
192        }
193      }  
194      /** Set to contain the contents of a string. 
195       */
196      public void set(String string) {
197        try {
198          ByteBuffer bb = encode(string, true);
199          bytes = bb.array();
200          length = bb.limit();
201        }catch(CharacterCodingException e) {
202          throw new RuntimeException("Should not have happened ", e); 
203        }
204      }
205    
206      /** Set to a utf8 byte array
207       */
208      public void set(byte[] utf8) {
209        set(utf8, 0, utf8.length);
210      }
211      
212      /** copy a text. */
213      public void set(Text other) {
214        set(other.getBytes(), 0, other.getLength());
215      }
216    
217      /**
218       * Set the Text to range of bytes
219       * @param utf8 the data to copy from
220       * @param start the first position of the new string
221       * @param len the number of bytes of the new string
222       */
223      public void set(byte[] utf8, int start, int len) {
224        setCapacity(len, false);
225        System.arraycopy(utf8, start, bytes, 0, len);
226        this.length = len;
227      }
228    
229      /**
230       * Append a range of bytes to the end of the given text
231       * @param utf8 the data to copy from
232       * @param start the first position to append from utf8
233       * @param len the number of bytes to append
234       */
235      public void append(byte[] utf8, int start, int len) {
236        setCapacity(length + len, true);
237        System.arraycopy(utf8, start, bytes, length, len);
238        length += len;
239      }
240    
241      /**
242       * Clear the string to empty.
243       *
244       * <em>Note</em>: For performance reasons, this call does not clear the
245       * underlying byte array that is retrievable via {@link #getBytes()}.
246       * In order to free the byte-array memory, call {@link #set(byte[])}
247       * with an empty byte array (For example, <code>new byte[0]</code>).
248       */
249      public void clear() {
250        length = 0;
251      }
252    
253      /*
254       * Sets the capacity of this Text object to <em>at least</em>
255       * <code>len</code> bytes. If the current buffer is longer,
256       * then the capacity and existing content of the buffer are
257       * unchanged. If <code>len</code> is larger
258       * than the current capacity, the Text object's capacity is
259       * increased to match.
260       * @param len the number of bytes we need
261       * @param keepData should the old data be kept
262       */
263      private void setCapacity(int len, boolean keepData) {
264        if (bytes == null || bytes.length < len) {
265          if (bytes != null && keepData) {
266            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
267          } else {
268            bytes = new byte[len];
269          }
270        }
271      }
272       
273      /** 
274       * Convert text back to string
275       * @see java.lang.Object#toString()
276       */
277      @Override
278      public String toString() {
279        try {
280          return decode(bytes, 0, length);
281        } catch (CharacterCodingException e) { 
282          throw new RuntimeException("Should not have happened " , e); 
283        }
284      }
285      
286      /** deserialize 
287       */
288      @Override
289      public void readFields(DataInput in) throws IOException {
290        int newLength = WritableUtils.readVInt(in);
291        readWithKnownLength(in, newLength);
292      }
293      
294      public void readFields(DataInput in, int maxLength) throws IOException {
295        int newLength = WritableUtils.readVInt(in);
296        if (newLength < 0) {
297          throw new IOException("tried to deserialize " + newLength +
298              " bytes of data!  newLength must be non-negative.");
299        } else if (newLength >= maxLength) {
300          throw new IOException("tried to deserialize " + newLength +
301              " bytes of data, but maxLength = " + maxLength);
302        }
303        readWithKnownLength(in, newLength);
304      }
305    
306      /** Skips over one Text in the input. */
307      public static void skip(DataInput in) throws IOException {
308        int length = WritableUtils.readVInt(in);
309        WritableUtils.skipFully(in, length);
310      }
311    
312      /**
313       * Read a Text object whose length is already known.
314       * This allows creating Text from a stream which uses a different serialization
315       * format.
316       */
317      public void readWithKnownLength(DataInput in, int len) throws IOException {
318        setCapacity(len, false);
319        in.readFully(bytes, 0, len);
320        length = len;
321      }
322    
323      /** serialize
324       * write this object to out
325       * length uses zero-compressed encoding
326       * @see Writable#write(DataOutput)
327       */
328      @Override
329      public void write(DataOutput out) throws IOException {
330        WritableUtils.writeVInt(out, length);
331        out.write(bytes, 0, length);
332      }
333    
334      public void write(DataOutput out, int maxLength) throws IOException {
335        if (length > maxLength) {
336          throw new IOException("data was too long to write!  Expected " +
337              "less than or equal to " + maxLength + " bytes, but got " +
338              length + " bytes.");
339        }
340        WritableUtils.writeVInt(out, length);
341        out.write(bytes, 0, length);
342      }
343    
344      /** Returns true iff <code>o</code> is a Text with the same contents.  */
345      @Override
346      public boolean equals(Object o) {
347        if (o instanceof Text)
348          return super.equals(o);
349        return false;
350      }
351    
352      @Override
353      public int hashCode() {
354        return super.hashCode();
355      }
356    
357      /** A WritableComparator optimized for Text keys. */
358      public static class Comparator extends WritableComparator {
359        public Comparator() {
360          super(Text.class);
361        }
362    
363        @Override
364        public int compare(byte[] b1, int s1, int l1,
365                           byte[] b2, int s2, int l2) {
366          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
367          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
368          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
369        }
370      }
371    
372      static {
373        // register this comparator
374        WritableComparator.define(Text.class, new Comparator());
375      }
376    
377      /// STATIC UTILITIES FROM HERE DOWN
378      /**
379       * Converts the provided byte array to a String using the
380       * UTF-8 encoding. If the input is malformed,
381       * replace by a default value.
382       */
383      public static String decode(byte[] utf8) throws CharacterCodingException {
384        return decode(ByteBuffer.wrap(utf8), true);
385      }
386      
387      public static String decode(byte[] utf8, int start, int length) 
388        throws CharacterCodingException {
389        return decode(ByteBuffer.wrap(utf8, start, length), true);
390      }
391      
392      /**
393       * Converts the provided byte array to a String using the
394       * UTF-8 encoding. If <code>replace</code> is true, then
395       * malformed input is replaced with the
396       * substitution character, which is U+FFFD. Otherwise the
397       * method throws a MalformedInputException.
398       */
399      public static String decode(byte[] utf8, int start, int length, boolean replace) 
400        throws CharacterCodingException {
401        return decode(ByteBuffer.wrap(utf8, start, length), replace);
402      }
403      
404      private static String decode(ByteBuffer utf8, boolean replace) 
405        throws CharacterCodingException {
406        CharsetDecoder decoder = DECODER_FACTORY.get();
407        if (replace) {
408          decoder.onMalformedInput(
409              java.nio.charset.CodingErrorAction.REPLACE);
410          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
411        }
412        String str = decoder.decode(utf8).toString();
413        // set decoder back to its default value: REPORT
414        if (replace) {
415          decoder.onMalformedInput(CodingErrorAction.REPORT);
416          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
417        }
418        return str;
419      }
420    
421      /**
422       * Converts the provided String to bytes using the
423       * UTF-8 encoding. If the input is malformed,
424       * invalid chars are replaced by a default value.
425       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
426       *                     and length is ByteBuffer.limit()
427       */
428    
429      public static ByteBuffer encode(String string)
430        throws CharacterCodingException {
431        return encode(string, true);
432      }
433    
434      /**
435       * Converts the provided String to bytes using the
436       * UTF-8 encoding. If <code>replace</code> is true, then
437       * malformed input is replaced with the
438       * substitution character, which is U+FFFD. Otherwise the
439       * method throws a MalformedInputException.
440       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
441       *                     and length is ByteBuffer.limit()
442       */
443      public static ByteBuffer encode(String string, boolean replace)
444        throws CharacterCodingException {
445        CharsetEncoder encoder = ENCODER_FACTORY.get();
446        if (replace) {
447          encoder.onMalformedInput(CodingErrorAction.REPLACE);
448          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
449        }
450        ByteBuffer bytes = 
451          encoder.encode(CharBuffer.wrap(string.toCharArray()));
452        if (replace) {
453          encoder.onMalformedInput(CodingErrorAction.REPORT);
454          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
455        }
456        return bytes;
457      }
458    
459      static final public int DEFAULT_MAX_LEN = 1024 * 1024;
460    
461      /** Read a UTF8 encoded string from in
462       */
463      public static String readString(DataInput in) throws IOException {
464        return readString(in, Integer.MAX_VALUE);
465      }
466      
467      /** Read a UTF8 encoded string with a maximum size
468       */
469      public static String readString(DataInput in, int maxLength)
470          throws IOException {
471        int length = WritableUtils.readVIntInRange(in, 0, maxLength);
472        byte [] bytes = new byte[length];
473        in.readFully(bytes, 0, length);
474        return decode(bytes);
475      }
476      
477      /** Write a UTF8 encoded string to out
478       */
479      public static int writeString(DataOutput out, String s) throws IOException {
480        ByteBuffer bytes = encode(s);
481        int length = bytes.limit();
482        WritableUtils.writeVInt(out, length);
483        out.write(bytes.array(), 0, length);
484        return length;
485      }
486    
487      /** Write a UTF8 encoded string with a maximum size to out
488       */
489      public static int writeString(DataOutput out, String s, int maxLength)
490          throws IOException {
491        ByteBuffer bytes = encode(s);
492        int length = bytes.limit();
493        if (length > maxLength) {
494          throw new IOException("string was too long to write!  Expected " +
495              "less than or equal to " + maxLength + " bytes, but got " +
496              length + " bytes.");
497        }
498        WritableUtils.writeVInt(out, length);
499        out.write(bytes.array(), 0, length);
500        return length;
501      }
502    
503      ////// states for validateUTF8
504      
505      private static final int LEAD_BYTE = 0;
506    
507      private static final int TRAIL_BYTE_1 = 1;
508    
509      private static final int TRAIL_BYTE = 2;
510    
511      /** 
512       * Check if a byte array contains valid utf-8
513       * @param utf8 byte array
514       * @throws MalformedInputException if the byte array contains invalid utf-8
515       */
516      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
517        validateUTF8(utf8, 0, utf8.length);     
518      }
519      
520      /**
521       * Check to see if a byte array is valid utf-8
522       * @param utf8 the array of bytes
523       * @param start the offset of the first byte in the array
524       * @param len the length of the byte sequence
525       * @throws MalformedInputException if the byte array contains invalid bytes
526       */
527      public static void validateUTF8(byte[] utf8, int start, int len)
528        throws MalformedInputException {
529        int count = start;
530        int leadByte = 0;
531        int length = 0;
532        int state = LEAD_BYTE;
533        while (count < start+len) {
534          int aByte = utf8[count] & 0xFF;
535    
536          switch (state) {
537          case LEAD_BYTE:
538            leadByte = aByte;
539            length = bytesFromUTF8[aByte];
540    
541            switch (length) {
542            case 0: // check for ASCII
543              if (leadByte > 0x7F)
544                throw new MalformedInputException(count);
545              break;
546            case 1:
547              if (leadByte < 0xC2 || leadByte > 0xDF)
548                throw new MalformedInputException(count);
549              state = TRAIL_BYTE_1;
550              break;
551            case 2:
552              if (leadByte < 0xE0 || leadByte > 0xEF)
553                throw new MalformedInputException(count);
554              state = TRAIL_BYTE_1;
555              break;
556            case 3:
557              if (leadByte < 0xF0 || leadByte > 0xF4)
558                throw new MalformedInputException(count);
559              state = TRAIL_BYTE_1;
560              break;
561            default:
562              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
563              // or if < 0 we got a trail byte in the lead byte position
564              throw new MalformedInputException(count);
565            } // switch (length)
566            break;
567    
568          case TRAIL_BYTE_1:
569            if (leadByte == 0xF0 && aByte < 0x90)
570              throw new MalformedInputException(count);
571            if (leadByte == 0xF4 && aByte > 0x8F)
572              throw new MalformedInputException(count);
573            if (leadByte == 0xE0 && aByte < 0xA0)
574              throw new MalformedInputException(count);
575            if (leadByte == 0xED && aByte > 0x9F)
576              throw new MalformedInputException(count);
577            // falls through to regular trail-byte test!!
578          case TRAIL_BYTE:
579            if (aByte < 0x80 || aByte > 0xBF)
580              throw new MalformedInputException(count);
581            if (--length == 0) {
582              state = LEAD_BYTE;
583            } else {
584              state = TRAIL_BYTE;
585            }
586            break;
587          } // switch (state)
588          count++;
589        }
590      }
591    
592      /**
593       * Magic numbers for UTF-8. These are the number of bytes
594       * that <em>follow</em> a given lead byte. Trailing bytes
595       * have the value -1. The values 4 and 5 are presented in
596       * this table, even though valid UTF-8 cannot include the
597       * five and six byte sequences.
598       */
599      static final int[] bytesFromUTF8 =
600      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
601        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
602        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
603        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
604        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
605        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
606        0, 0, 0, 0, 0, 0, 0,
607        // trail bytes
608        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
609        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
610        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
611        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
612        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
613        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
614        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
615    
616      /**
617       * Returns the next code point at the current position in
618       * the buffer. The buffer's position will be incremented.
619       * Any mark set on this buffer will be changed by this method!
620       */
621      public static int bytesToCodePoint(ByteBuffer bytes) {
622        bytes.mark();
623        byte b = bytes.get();
624        bytes.reset();
625        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
626        if (extraBytesToRead < 0) return -1; // trailing byte!
627        int ch = 0;
628    
629        switch (extraBytesToRead) {
630        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
631        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
632        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
633        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
634        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
635        case 0: ch += (bytes.get() & 0xFF);
636        }
637        ch -= offsetsFromUTF8[extraBytesToRead];
638    
639        return ch;
640      }
641    
642      
643      static final int offsetsFromUTF8[] =
644      { 0x00000000, 0x00003080,
645        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
646    
647      /**
648       * For the given string, returns the number of UTF-8 bytes
649       * required to encode the string.
650       * @param string text to encode
651       * @return number of UTF-8 bytes required to encode
652       */
653      public static int utf8Length(String string) {
654        CharacterIterator iter = new StringCharacterIterator(string);
655        char ch = iter.first();
656        int size = 0;
657        while (ch != CharacterIterator.DONE) {
658          if ((ch >= 0xD800) && (ch < 0xDC00)) {
659            // surrogate pair?
660            char trail = iter.next();
661            if ((trail > 0xDBFF) && (trail < 0xE000)) {
662              // valid pair
663              size += 4;
664            } else {
665              // invalid pair
666              size += 3;
667              iter.previous(); // rewind one
668            }
669          } else if (ch < 0x80) {
670            size++;
671          } else if (ch < 0x800) {
672            size += 2;
673          } else {
674            // ch < 0x10000, that is, the largest char value
675            size += 3;
676          }
677          ch = iter.next();
678        }
679        return size;
680      }
681    }