001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.io;
020    
021    import java.io.IOException;
022    import java.io.DataInput;
023    import java.io.DataOutput;
024    import java.nio.ByteBuffer;
025    import java.nio.CharBuffer;
026    import java.nio.charset.CharacterCodingException;
027    import java.nio.charset.Charset;
028    import java.nio.charset.CharsetDecoder;
029    import java.nio.charset.CharsetEncoder;
030    import java.nio.charset.CodingErrorAction;
031    import java.nio.charset.MalformedInputException;
032    import java.text.CharacterIterator;
033    import java.text.StringCharacterIterator;
034    import java.util.Arrays;
035    
036    import org.apache.avro.reflect.Stringable;
037    
038    import org.apache.hadoop.classification.InterfaceAudience;
039    import org.apache.hadoop.classification.InterfaceStability;
040    
041    /** This class stores text using standard UTF8 encoding.  It provides methods
042     * to serialize, deserialize, and compare texts at byte level.  The type of
043     * length is integer and is serialized using zero-compressed format.  <p>In
044     * addition, it provides methods for string traversal without converting the
045     * byte array to a string.  <p>Also includes utilities for
046     * serializing/deserialing a string, coding/decoding a string, checking if a
047     * byte array contains valid UTF8 code, calculating the length of an encoded
048     * string.
049     */
050    @Stringable
051    @InterfaceAudience.Public
052    @InterfaceStability.Stable
053    public class Text extends BinaryComparable
054        implements WritableComparable<BinaryComparable> {
055      
056      private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY =
057        new ThreadLocal<CharsetEncoder>() {
058          @Override
059          protected CharsetEncoder initialValue() {
060            return Charset.forName("UTF-8").newEncoder().
061                   onMalformedInput(CodingErrorAction.REPORT).
062                   onUnmappableCharacter(CodingErrorAction.REPORT);
063        }
064      };
065      
066      private static ThreadLocal<CharsetDecoder> DECODER_FACTORY =
067        new ThreadLocal<CharsetDecoder>() {
068        @Override
069        protected CharsetDecoder initialValue() {
070          return Charset.forName("UTF-8").newDecoder().
071                 onMalformedInput(CodingErrorAction.REPORT).
072                 onUnmappableCharacter(CodingErrorAction.REPORT);
073        }
074      };
075      
076      private static final byte [] EMPTY_BYTES = new byte[0];
077      
078      private byte[] bytes;
079      private int length;
080    
081      public Text() {
082        bytes = EMPTY_BYTES;
083      }
084    
085      /** Construct from a string. 
086       */
087      public Text(String string) {
088        set(string);
089      }
090    
091      /** Construct from another text. */
092      public Text(Text utf8) {
093        set(utf8);
094      }
095    
096      /** Construct from a byte array.
097       */
098      public Text(byte[] utf8)  {
099        set(utf8);
100      }
101      
102      /**
103       * Get a copy of the bytes that is exactly the length of the data.
104       * See {@link #getBytes()} for faster access to the underlying array.
105       */
106      public byte[] copyBytes() {
107        byte[] result = new byte[length];
108        System.arraycopy(bytes, 0, result, 0, length);
109        return result;
110      }
111      
112      /**
113       * Returns the raw bytes; however, only data up to {@link #getLength()} is
114       * valid. Please use {@link #copyBytes()} if you
115       * need the returned array to be precisely the length of the data.
116       */
117      @Override
118      public byte[] getBytes() {
119        return bytes;
120      }
121    
122      /** Returns the number of bytes in the byte array */ 
123      @Override
124      public int getLength() {
125        return length;
126      }
127      
128      /**
129       * Returns the Unicode Scalar Value (32-bit integer value)
130       * for the character at <code>position</code>. Note that this
131       * method avoids using the converter or doing String instantiation
132       * @return the Unicode scalar value at position or -1
133       *          if the position is invalid or points to a
134       *          trailing byte
135       */
136      public int charAt(int position) {
137        if (position > this.length) return -1; // too long
138        if (position < 0) return -1; // duh.
139          
140        ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position);
141        return bytesToCodePoint(bb.slice());
142      }
143      
144      public int find(String what) {
145        return find(what, 0);
146      }
147      
148      /**
149       * Finds any occurence of <code>what</code> in the backing
150       * buffer, starting as position <code>start</code>. The starting
151       * position is measured in bytes and the return value is in
152       * terms of byte position in the buffer. The backing buffer is
153       * not converted to a string for this operation.
154       * @return byte position of the first occurence of the search
155       *         string in the UTF-8 buffer or -1 if not found
156       */
157      public int find(String what, int start) {
158        try {
159          ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length);
160          ByteBuffer tgt = encode(what);
161          byte b = tgt.get();
162          src.position(start);
163              
164          while (src.hasRemaining()) {
165            if (b == src.get()) { // matching first byte
166              src.mark(); // save position in loop
167              tgt.mark(); // save position in target
168              boolean found = true;
169              int pos = src.position()-1;
170              while (tgt.hasRemaining()) {
171                if (!src.hasRemaining()) { // src expired first
172                  tgt.reset();
173                  src.reset();
174                  found = false;
175                  break;
176                }
177                if (!(tgt.get() == src.get())) {
178                  tgt.reset();
179                  src.reset();
180                  found = false;
181                  break; // no match
182                }
183              }
184              if (found) return pos;
185            }
186          }
187          return -1; // not found
188        } catch (CharacterCodingException e) {
189          // can't get here
190          e.printStackTrace();
191          return -1;
192        }
193      }  
194      /** Set to contain the contents of a string. 
195       */
196      public void set(String string) {
197        try {
198          ByteBuffer bb = encode(string, true);
199          bytes = bb.array();
200          length = bb.limit();
201        }catch(CharacterCodingException e) {
202          throw new RuntimeException("Should not have happened ", e); 
203        }
204      }
205    
206      /** Set to a utf8 byte array
207       */
208      public void set(byte[] utf8) {
209        set(utf8, 0, utf8.length);
210      }
211      
212      /** copy a text. */
213      public void set(Text other) {
214        set(other.getBytes(), 0, other.getLength());
215      }
216    
217      /**
218       * Set the Text to range of bytes
219       * @param utf8 the data to copy from
220       * @param start the first position of the new string
221       * @param len the number of bytes of the new string
222       */
223      public void set(byte[] utf8, int start, int len) {
224        setCapacity(len, false);
225        System.arraycopy(utf8, start, bytes, 0, len);
226        this.length = len;
227      }
228    
229      /**
230       * Append a range of bytes to the end of the given text
231       * @param utf8 the data to copy from
232       * @param start the first position to append from utf8
233       * @param len the number of bytes to append
234       */
235      public void append(byte[] utf8, int start, int len) {
236        setCapacity(length + len, true);
237        System.arraycopy(utf8, start, bytes, length, len);
238        length += len;
239      }
240    
241      /**
242       * Clear the string to empty.
243       *
244       * <em>Note</em>: For performance reasons, this call does not clear the
245       * underlying byte array that is retrievable via {@link #getBytes()}.
246       * In order to free the byte-array memory, call {@link #set(byte[])}
247       * with an empty byte array (For example, <code>new byte[0]</code>).
248       */
249      public void clear() {
250        length = 0;
251      }
252    
253      /*
254       * Sets the capacity of this Text object to <em>at least</em>
255       * <code>len</code> bytes. If the current buffer is longer,
256       * then the capacity and existing content of the buffer are
257       * unchanged. If <code>len</code> is larger
258       * than the current capacity, the Text object's capacity is
259       * increased to match.
260       * @param len the number of bytes we need
261       * @param keepData should the old data be kept
262       */
263      private void setCapacity(int len, boolean keepData) {
264        if (bytes == null || bytes.length < len) {
265          if (bytes != null && keepData) {
266            bytes = Arrays.copyOf(bytes, Math.max(len,length << 1));
267          } else {
268            bytes = new byte[len];
269          }
270        }
271      }
272       
273      /** 
274       * Convert text back to string
275       * @see java.lang.Object#toString()
276       */
277      @Override
278      public String toString() {
279        try {
280          return decode(bytes, 0, length);
281        } catch (CharacterCodingException e) { 
282          throw new RuntimeException("Should not have happened " , e); 
283        }
284      }
285      
286      /** deserialize 
287       */
288      @Override
289      public void readFields(DataInput in) throws IOException {
290        int newLength = WritableUtils.readVInt(in);
291        setCapacity(newLength, false);
292        in.readFully(bytes, 0, newLength);
293        length = newLength;
294      }
295      
296      public void readFields(DataInput in, int maxLength) throws IOException {
297        int newLength = WritableUtils.readVInt(in);
298        if (newLength < 0) {
299          throw new IOException("tried to deserialize " + newLength +
300              " bytes of data!  newLength must be non-negative.");
301        } else if (newLength >= maxLength) {
302          throw new IOException("tried to deserialize " + newLength +
303              " bytes of data, but maxLength = " + maxLength);
304        }
305        setCapacity(newLength, false);
306        in.readFully(bytes, 0, newLength);
307        length = newLength;
308      }
309    
310      /** Skips over one Text in the input. */
311      public static void skip(DataInput in) throws IOException {
312        int length = WritableUtils.readVInt(in);
313        WritableUtils.skipFully(in, length);
314      }
315    
316      /** serialize
317       * write this object to out
318       * length uses zero-compressed encoding
319       * @see Writable#write(DataOutput)
320       */
321      @Override
322      public void write(DataOutput out) throws IOException {
323        WritableUtils.writeVInt(out, length);
324        out.write(bytes, 0, length);
325      }
326    
327      public void write(DataOutput out, int maxLength) throws IOException {
328        if (length > maxLength) {
329          throw new IOException("data was too long to write!  Expected " +
330              "less than or equal to " + maxLength + " bytes, but got " +
331              length + " bytes.");
332        }
333        WritableUtils.writeVInt(out, length);
334        out.write(bytes, 0, length);
335      }
336    
337      /** Returns true iff <code>o</code> is a Text with the same contents.  */
338      @Override
339      public boolean equals(Object o) {
340        if (o instanceof Text)
341          return super.equals(o);
342        return false;
343      }
344    
345      @Override
346      public int hashCode() {
347        return super.hashCode();
348      }
349    
350      /** A WritableComparator optimized for Text keys. */
351      public static class Comparator extends WritableComparator {
352        public Comparator() {
353          super(Text.class);
354        }
355    
356        @Override
357        public int compare(byte[] b1, int s1, int l1,
358                           byte[] b2, int s2, int l2) {
359          int n1 = WritableUtils.decodeVIntSize(b1[s1]);
360          int n2 = WritableUtils.decodeVIntSize(b2[s2]);
361          return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
362        }
363      }
364    
365      static {
366        // register this comparator
367        WritableComparator.define(Text.class, new Comparator());
368      }
369    
370      /// STATIC UTILITIES FROM HERE DOWN
371      /**
372       * Converts the provided byte array to a String using the
373       * UTF-8 encoding. If the input is malformed,
374       * replace by a default value.
375       */
376      public static String decode(byte[] utf8) throws CharacterCodingException {
377        return decode(ByteBuffer.wrap(utf8), true);
378      }
379      
380      public static String decode(byte[] utf8, int start, int length) 
381        throws CharacterCodingException {
382        return decode(ByteBuffer.wrap(utf8, start, length), true);
383      }
384      
385      /**
386       * Converts the provided byte array to a String using the
387       * UTF-8 encoding. If <code>replace</code> is true, then
388       * malformed input is replaced with the
389       * substitution character, which is U+FFFD. Otherwise the
390       * method throws a MalformedInputException.
391       */
392      public static String decode(byte[] utf8, int start, int length, boolean replace) 
393        throws CharacterCodingException {
394        return decode(ByteBuffer.wrap(utf8, start, length), replace);
395      }
396      
397      private static String decode(ByteBuffer utf8, boolean replace) 
398        throws CharacterCodingException {
399        CharsetDecoder decoder = DECODER_FACTORY.get();
400        if (replace) {
401          decoder.onMalformedInput(
402              java.nio.charset.CodingErrorAction.REPLACE);
403          decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
404        }
405        String str = decoder.decode(utf8).toString();
406        // set decoder back to its default value: REPORT
407        if (replace) {
408          decoder.onMalformedInput(CodingErrorAction.REPORT);
409          decoder.onUnmappableCharacter(CodingErrorAction.REPORT);
410        }
411        return str;
412      }
413    
414      /**
415       * Converts the provided String to bytes using the
416       * UTF-8 encoding. If the input is malformed,
417       * invalid chars are replaced by a default value.
418       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
419       *                     and length is ByteBuffer.limit()
420       */
421    
422      public static ByteBuffer encode(String string)
423        throws CharacterCodingException {
424        return encode(string, true);
425      }
426    
427      /**
428       * Converts the provided String to bytes using the
429       * UTF-8 encoding. If <code>replace</code> is true, then
430       * malformed input is replaced with the
431       * substitution character, which is U+FFFD. Otherwise the
432       * method throws a MalformedInputException.
433       * @return ByteBuffer: bytes stores at ByteBuffer.array() 
434       *                     and length is ByteBuffer.limit()
435       */
436      public static ByteBuffer encode(String string, boolean replace)
437        throws CharacterCodingException {
438        CharsetEncoder encoder = ENCODER_FACTORY.get();
439        if (replace) {
440          encoder.onMalformedInput(CodingErrorAction.REPLACE);
441          encoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
442        }
443        ByteBuffer bytes = 
444          encoder.encode(CharBuffer.wrap(string.toCharArray()));
445        if (replace) {
446          encoder.onMalformedInput(CodingErrorAction.REPORT);
447          encoder.onUnmappableCharacter(CodingErrorAction.REPORT);
448        }
449        return bytes;
450      }
451    
452      static final public int DEFAULT_MAX_LEN = 1024 * 1024;
453    
454      /** Read a UTF8 encoded string from in
455       */
456      public static String readString(DataInput in) throws IOException {
457        return readString(in, Integer.MAX_VALUE);
458      }
459      
460      /** Read a UTF8 encoded string with a maximum size
461       */
462      public static String readString(DataInput in, int maxLength)
463          throws IOException {
464        int length = WritableUtils.readVIntInRange(in, 0, maxLength);
465        byte [] bytes = new byte[length];
466        in.readFully(bytes, 0, length);
467        return decode(bytes);
468      }
469      
470      /** Write a UTF8 encoded string to out
471       */
472      public static int writeString(DataOutput out, String s) throws IOException {
473        ByteBuffer bytes = encode(s);
474        int length = bytes.limit();
475        WritableUtils.writeVInt(out, length);
476        out.write(bytes.array(), 0, length);
477        return length;
478      }
479    
480      /** Write a UTF8 encoded string with a maximum size to out
481       */
482      public static int writeString(DataOutput out, String s, int maxLength)
483          throws IOException {
484        ByteBuffer bytes = encode(s);
485        int length = bytes.limit();
486        if (length > maxLength) {
487          throw new IOException("string was too long to write!  Expected " +
488              "less than or equal to " + maxLength + " bytes, but got " +
489              length + " bytes.");
490        }
491        WritableUtils.writeVInt(out, length);
492        out.write(bytes.array(), 0, length);
493        return length;
494      }
495    
496      ////// states for validateUTF8
497      
498      private static final int LEAD_BYTE = 0;
499    
500      private static final int TRAIL_BYTE_1 = 1;
501    
502      private static final int TRAIL_BYTE = 2;
503    
504      /** 
505       * Check if a byte array contains valid utf-8
506       * @param utf8 byte array
507       * @throws MalformedInputException if the byte array contains invalid utf-8
508       */
509      public static void validateUTF8(byte[] utf8) throws MalformedInputException {
510        validateUTF8(utf8, 0, utf8.length);     
511      }
512      
513      /**
514       * Check to see if a byte array is valid utf-8
515       * @param utf8 the array of bytes
516       * @param start the offset of the first byte in the array
517       * @param len the length of the byte sequence
518       * @throws MalformedInputException if the byte array contains invalid bytes
519       */
520      public static void validateUTF8(byte[] utf8, int start, int len)
521        throws MalformedInputException {
522        int count = start;
523        int leadByte = 0;
524        int length = 0;
525        int state = LEAD_BYTE;
526        while (count < start+len) {
527          int aByte = utf8[count] & 0xFF;
528    
529          switch (state) {
530          case LEAD_BYTE:
531            leadByte = aByte;
532            length = bytesFromUTF8[aByte];
533    
534            switch (length) {
535            case 0: // check for ASCII
536              if (leadByte > 0x7F)
537                throw new MalformedInputException(count);
538              break;
539            case 1:
540              if (leadByte < 0xC2 || leadByte > 0xDF)
541                throw new MalformedInputException(count);
542              state = TRAIL_BYTE_1;
543              break;
544            case 2:
545              if (leadByte < 0xE0 || leadByte > 0xEF)
546                throw new MalformedInputException(count);
547              state = TRAIL_BYTE_1;
548              break;
549            case 3:
550              if (leadByte < 0xF0 || leadByte > 0xF4)
551                throw new MalformedInputException(count);
552              state = TRAIL_BYTE_1;
553              break;
554            default:
555              // too long! Longest valid UTF-8 is 4 bytes (lead + three)
556              // or if < 0 we got a trail byte in the lead byte position
557              throw new MalformedInputException(count);
558            } // switch (length)
559            break;
560    
561          case TRAIL_BYTE_1:
562            if (leadByte == 0xF0 && aByte < 0x90)
563              throw new MalformedInputException(count);
564            if (leadByte == 0xF4 && aByte > 0x8F)
565              throw new MalformedInputException(count);
566            if (leadByte == 0xE0 && aByte < 0xA0)
567              throw new MalformedInputException(count);
568            if (leadByte == 0xED && aByte > 0x9F)
569              throw new MalformedInputException(count);
570            // falls through to regular trail-byte test!!
571          case TRAIL_BYTE:
572            if (aByte < 0x80 || aByte > 0xBF)
573              throw new MalformedInputException(count);
574            if (--length == 0) {
575              state = LEAD_BYTE;
576            } else {
577              state = TRAIL_BYTE;
578            }
579            break;
580          } // switch (state)
581          count++;
582        }
583      }
584    
585      /**
586       * Magic numbers for UTF-8. These are the number of bytes
587       * that <em>follow</em> a given lead byte. Trailing bytes
588       * have the value -1. The values 4 and 5 are presented in
589       * this table, even though valid UTF-8 cannot include the
590       * five and six byte sequences.
591       */
592      static final int[] bytesFromUTF8 =
593      { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
594        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
595        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
596        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
597        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
598        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
599        0, 0, 0, 0, 0, 0, 0,
600        // trail bytes
601        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
602        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
603        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
604        -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1,
605        1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
606        1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3,
607        3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 };
608    
609      /**
610       * Returns the next code point at the current position in
611       * the buffer. The buffer's position will be incremented.
612       * Any mark set on this buffer will be changed by this method!
613       */
614      public static int bytesToCodePoint(ByteBuffer bytes) {
615        bytes.mark();
616        byte b = bytes.get();
617        bytes.reset();
618        int extraBytesToRead = bytesFromUTF8[(b & 0xFF)];
619        if (extraBytesToRead < 0) return -1; // trailing byte!
620        int ch = 0;
621    
622        switch (extraBytesToRead) {
623        case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
624        case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */
625        case 3: ch += (bytes.get() & 0xFF); ch <<= 6;
626        case 2: ch += (bytes.get() & 0xFF); ch <<= 6;
627        case 1: ch += (bytes.get() & 0xFF); ch <<= 6;
628        case 0: ch += (bytes.get() & 0xFF);
629        }
630        ch -= offsetsFromUTF8[extraBytesToRead];
631    
632        return ch;
633      }
634    
635      
636      static final int offsetsFromUTF8[] =
637      { 0x00000000, 0x00003080,
638        0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 };
639    
640      /**
641       * For the given string, returns the number of UTF-8 bytes
642       * required to encode the string.
643       * @param string text to encode
644       * @return number of UTF-8 bytes required to encode
645       */
646      public static int utf8Length(String string) {
647        CharacterIterator iter = new StringCharacterIterator(string);
648        char ch = iter.first();
649        int size = 0;
650        while (ch != CharacterIterator.DONE) {
651          if ((ch >= 0xD800) && (ch < 0xDC00)) {
652            // surrogate pair?
653            char trail = iter.next();
654            if ((trail > 0xDBFF) && (trail < 0xE000)) {
655              // valid pair
656              size += 4;
657            } else {
658              // invalid pair
659              size += 3;
660              iter.previous(); // rewind one
661            }
662          } else if (ch < 0x80) {
663            size++;
664          } else if (ch < 0x800) {
665            size += 2;
666          } else {
667            // ch < 0x10000, that is, the largest char value
668            size += 3;
669          }
670          ch = iter.next();
671        }
672        return size;
673      }
674    }