001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.io; 020 021import java.io.IOException; 022import java.io.DataInput; 023import java.io.DataOutput; 024import java.nio.ByteBuffer; 025import java.nio.CharBuffer; 026import java.nio.charset.CharacterCodingException; 027import java.nio.charset.Charset; 028import java.nio.charset.CharsetDecoder; 029import java.nio.charset.CharsetEncoder; 030import java.nio.charset.CodingErrorAction; 031import java.nio.charset.MalformedInputException; 032import java.text.CharacterIterator; 033import java.text.StringCharacterIterator; 034import java.util.Arrays; 035 036import org.apache.avro.reflect.Stringable; 037 038import org.apache.hadoop.classification.InterfaceAudience; 039import org.apache.hadoop.classification.InterfaceStability; 040 041/** This class stores text using standard UTF8 encoding. It provides methods 042 * to serialize, deserialize, and compare texts at byte level. The type of 043 * length is integer and is serialized using zero-compressed format. <p>In 044 * addition, it provides methods for string traversal without converting the 045 * byte array to a string. <p>Also includes utilities for 046 * serializing/deserialing a string, coding/decoding a string, checking if a 047 * byte array contains valid UTF8 code, calculating the length of an encoded 048 * string. 049 */ 050@Stringable 051@InterfaceAudience.Public 052@InterfaceStability.Stable 053public class Text extends BinaryComparable 054 implements WritableComparable<BinaryComparable> { 055 056 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = 057 new ThreadLocal<CharsetEncoder>() { 058 @Override 059 protected CharsetEncoder initialValue() { 060 return Charset.forName("UTF-8").newEncoder(). 061 onMalformedInput(CodingErrorAction.REPORT). 062 onUnmappableCharacter(CodingErrorAction.REPORT); 063 } 064 }; 065 066 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = 067 new ThreadLocal<CharsetDecoder>() { 068 @Override 069 protected CharsetDecoder initialValue() { 070 return Charset.forName("UTF-8").newDecoder(). 071 onMalformedInput(CodingErrorAction.REPORT). 072 onUnmappableCharacter(CodingErrorAction.REPORT); 073 } 074 }; 075 076 private static final byte [] EMPTY_BYTES = new byte[0]; 077 078 private byte[] bytes; 079 private int length; 080 081 public Text() { 082 bytes = EMPTY_BYTES; 083 } 084 085 /** Construct from a string. 086 */ 087 public Text(String string) { 088 set(string); 089 } 090 091 /** Construct from another text. */ 092 public Text(Text utf8) { 093 set(utf8); 094 } 095 096 /** Construct from a byte array. 097 */ 098 public Text(byte[] utf8) { 099 set(utf8); 100 } 101 102 /** 103 * Get a copy of the bytes that is exactly the length of the data. 104 * See {@link #getBytes()} for faster access to the underlying array. 105 */ 106 public byte[] copyBytes() { 107 byte[] result = new byte[length]; 108 System.arraycopy(bytes, 0, result, 0, length); 109 return result; 110 } 111 112 /** 113 * Returns the raw bytes; however, only data up to {@link #getLength()} is 114 * valid. Please use {@link #copyBytes()} if you 115 * need the returned array to be precisely the length of the data. 116 */ 117 @Override 118 public byte[] getBytes() { 119 return bytes; 120 } 121 122 /** Returns the number of bytes in the byte array */ 123 @Override 124 public int getLength() { 125 return length; 126 } 127 128 /** 129 * Returns the Unicode Scalar Value (32-bit integer value) 130 * for the character at <code>position</code>. Note that this 131 * method avoids using the converter or doing String instantiation 132 * @return the Unicode scalar value at position or -1 133 * if the position is invalid or points to a 134 * trailing byte 135 */ 136 public int charAt(int position) { 137 if (position > this.length) return -1; // too long 138 if (position < 0) return -1; // duh. 139 140 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position); 141 return bytesToCodePoint(bb.slice()); 142 } 143 144 public int find(String what) { 145 return find(what, 0); 146 } 147 148 /** 149 * Finds any occurence of <code>what</code> in the backing 150 * buffer, starting as position <code>start</code>. The starting 151 * position is measured in bytes and the return value is in 152 * terms of byte position in the buffer. The backing buffer is 153 * not converted to a string for this operation. 154 * @return byte position of the first occurence of the search 155 * string in the UTF-8 buffer or -1 if not found 156 */ 157 public int find(String what, int start) { 158 try { 159 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length); 160 ByteBuffer tgt = encode(what); 161 byte b = tgt.get(); 162 src.position(start); 163 164 while (src.hasRemaining()) { 165 if (b == src.get()) { // matching first byte 166 src.mark(); // save position in loop 167 tgt.mark(); // save position in target 168 boolean found = true; 169 int pos = src.position()-1; 170 while (tgt.hasRemaining()) { 171 if (!src.hasRemaining()) { // src expired first 172 tgt.reset(); 173 src.reset(); 174 found = false; 175 break; 176 } 177 if (!(tgt.get() == src.get())) { 178 tgt.reset(); 179 src.reset(); 180 found = false; 181 break; // no match 182 } 183 } 184 if (found) return pos; 185 } 186 } 187 return -1; // not found 188 } catch (CharacterCodingException e) { 189 // can't get here 190 e.printStackTrace(); 191 return -1; 192 } 193 } 194 /** Set to contain the contents of a string. 195 */ 196 public void set(String string) { 197 try { 198 ByteBuffer bb = encode(string, true); 199 bytes = bb.array(); 200 length = bb.limit(); 201 }catch(CharacterCodingException e) { 202 throw new RuntimeException("Should not have happened ", e); 203 } 204 } 205 206 /** Set to a utf8 byte array 207 */ 208 public void set(byte[] utf8) { 209 set(utf8, 0, utf8.length); 210 } 211 212 /** copy a text. */ 213 public void set(Text other) { 214 set(other.getBytes(), 0, other.getLength()); 215 } 216 217 /** 218 * Set the Text to range of bytes 219 * @param utf8 the data to copy from 220 * @param start the first position of the new string 221 * @param len the number of bytes of the new string 222 */ 223 public void set(byte[] utf8, int start, int len) { 224 setCapacity(len, false); 225 System.arraycopy(utf8, start, bytes, 0, len); 226 this.length = len; 227 } 228 229 /** 230 * Append a range of bytes to the end of the given text 231 * @param utf8 the data to copy from 232 * @param start the first position to append from utf8 233 * @param len the number of bytes to append 234 */ 235 public void append(byte[] utf8, int start, int len) { 236 setCapacity(length + len, true); 237 System.arraycopy(utf8, start, bytes, length, len); 238 length += len; 239 } 240 241 /** 242 * Clear the string to empty. 243 * 244 * <em>Note</em>: For performance reasons, this call does not clear the 245 * underlying byte array that is retrievable via {@link #getBytes()}. 246 * In order to free the byte-array memory, call {@link #set(byte[])} 247 * with an empty byte array (For example, <code>new byte[0]</code>). 248 */ 249 public void clear() { 250 length = 0; 251 } 252 253 /* 254 * Sets the capacity of this Text object to <em>at least</em> 255 * <code>len</code> bytes. If the current buffer is longer, 256 * then the capacity and existing content of the buffer are 257 * unchanged. If <code>len</code> is larger 258 * than the current capacity, the Text object's capacity is 259 * increased to match. 260 * @param len the number of bytes we need 261 * @param keepData should the old data be kept 262 */ 263 private void setCapacity(int len, boolean keepData) { 264 if (bytes == null || bytes.length < len) { 265 if (bytes != null && keepData) { 266 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1)); 267 } else { 268 bytes = new byte[len]; 269 } 270 } 271 } 272 273 /** 274 * Convert text back to string 275 * @see java.lang.Object#toString() 276 */ 277 @Override 278 public String toString() { 279 try { 280 return decode(bytes, 0, length); 281 } catch (CharacterCodingException e) { 282 throw new RuntimeException("Should not have happened " , e); 283 } 284 } 285 286 /** deserialize 287 */ 288 @Override 289 public void readFields(DataInput in) throws IOException { 290 int newLength = WritableUtils.readVInt(in); 291 readWithKnownLength(in, newLength); 292 } 293 294 public void readFields(DataInput in, int maxLength) throws IOException { 295 int newLength = WritableUtils.readVInt(in); 296 if (newLength < 0) { 297 throw new IOException("tried to deserialize " + newLength + 298 " bytes of data! newLength must be non-negative."); 299 } else if (newLength >= maxLength) { 300 throw new IOException("tried to deserialize " + newLength + 301 " bytes of data, but maxLength = " + maxLength); 302 } 303 readWithKnownLength(in, newLength); 304 } 305 306 /** Skips over one Text in the input. */ 307 public static void skip(DataInput in) throws IOException { 308 int length = WritableUtils.readVInt(in); 309 WritableUtils.skipFully(in, length); 310 } 311 312 /** 313 * Read a Text object whose length is already known. 314 * This allows creating Text from a stream which uses a different serialization 315 * format. 316 */ 317 public void readWithKnownLength(DataInput in, int len) throws IOException { 318 setCapacity(len, false); 319 in.readFully(bytes, 0, len); 320 length = len; 321 } 322 323 /** serialize 324 * write this object to out 325 * length uses zero-compressed encoding 326 * @see Writable#write(DataOutput) 327 */ 328 @Override 329 public void write(DataOutput out) throws IOException { 330 WritableUtils.writeVInt(out, length); 331 out.write(bytes, 0, length); 332 } 333 334 public void write(DataOutput out, int maxLength) throws IOException { 335 if (length > maxLength) { 336 throw new IOException("data was too long to write! Expected " + 337 "less than or equal to " + maxLength + " bytes, but got " + 338 length + " bytes."); 339 } 340 WritableUtils.writeVInt(out, length); 341 out.write(bytes, 0, length); 342 } 343 344 /** Returns true iff <code>o</code> is a Text with the same contents. */ 345 @Override 346 public boolean equals(Object o) { 347 if (o instanceof Text) 348 return super.equals(o); 349 return false; 350 } 351 352 @Override 353 public int hashCode() { 354 return super.hashCode(); 355 } 356 357 /** A WritableComparator optimized for Text keys. */ 358 public static class Comparator extends WritableComparator { 359 public Comparator() { 360 super(Text.class); 361 } 362 363 @Override 364 public int compare(byte[] b1, int s1, int l1, 365 byte[] b2, int s2, int l2) { 366 int n1 = WritableUtils.decodeVIntSize(b1[s1]); 367 int n2 = WritableUtils.decodeVIntSize(b2[s2]); 368 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 369 } 370 } 371 372 static { 373 // register this comparator 374 WritableComparator.define(Text.class, new Comparator()); 375 } 376 377 /// STATIC UTILITIES FROM HERE DOWN 378 /** 379 * Converts the provided byte array to a String using the 380 * UTF-8 encoding. If the input is malformed, 381 * replace by a default value. 382 */ 383 public static String decode(byte[] utf8) throws CharacterCodingException { 384 return decode(ByteBuffer.wrap(utf8), true); 385 } 386 387 public static String decode(byte[] utf8, int start, int length) 388 throws CharacterCodingException { 389 return decode(ByteBuffer.wrap(utf8, start, length), true); 390 } 391 392 /** 393 * Converts the provided byte array to a String using the 394 * UTF-8 encoding. If <code>replace</code> is true, then 395 * malformed input is replaced with the 396 * substitution character, which is U+FFFD. Otherwise the 397 * method throws a MalformedInputException. 398 */ 399 public static String decode(byte[] utf8, int start, int length, boolean replace) 400 throws CharacterCodingException { 401 return decode(ByteBuffer.wrap(utf8, start, length), replace); 402 } 403 404 private static String decode(ByteBuffer utf8, boolean replace) 405 throws CharacterCodingException { 406 CharsetDecoder decoder = DECODER_FACTORY.get(); 407 if (replace) { 408 decoder.onMalformedInput( 409 java.nio.charset.CodingErrorAction.REPLACE); 410 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 411 } 412 String str = decoder.decode(utf8).toString(); 413 // set decoder back to its default value: REPORT 414 if (replace) { 415 decoder.onMalformedInput(CodingErrorAction.REPORT); 416 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 417 } 418 return str; 419 } 420 421 /** 422 * Converts the provided String to bytes using the 423 * UTF-8 encoding. If the input is malformed, 424 * invalid chars are replaced by a default value. 425 * @return ByteBuffer: bytes stores at ByteBuffer.array() 426 * and length is ByteBuffer.limit() 427 */ 428 429 public static ByteBuffer encode(String string) 430 throws CharacterCodingException { 431 return encode(string, true); 432 } 433 434 /** 435 * Converts the provided String to bytes using the 436 * UTF-8 encoding. If <code>replace</code> is true, then 437 * malformed input is replaced with the 438 * substitution character, which is U+FFFD. Otherwise the 439 * method throws a MalformedInputException. 440 * @return ByteBuffer: bytes stores at ByteBuffer.array() 441 * and length is ByteBuffer.limit() 442 */ 443 public static ByteBuffer encode(String string, boolean replace) 444 throws CharacterCodingException { 445 CharsetEncoder encoder = ENCODER_FACTORY.get(); 446 if (replace) { 447 encoder.onMalformedInput(CodingErrorAction.REPLACE); 448 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 449 } 450 ByteBuffer bytes = 451 encoder.encode(CharBuffer.wrap(string.toCharArray())); 452 if (replace) { 453 encoder.onMalformedInput(CodingErrorAction.REPORT); 454 encoder.onUnmappableCharacter(CodingErrorAction.REPORT); 455 } 456 return bytes; 457 } 458 459 static final public int DEFAULT_MAX_LEN = 1024 * 1024; 460 461 /** Read a UTF8 encoded string from in 462 */ 463 public static String readString(DataInput in) throws IOException { 464 return readString(in, Integer.MAX_VALUE); 465 } 466 467 /** Read a UTF8 encoded string with a maximum size 468 */ 469 public static String readString(DataInput in, int maxLength) 470 throws IOException { 471 int length = WritableUtils.readVIntInRange(in, 0, maxLength); 472 byte [] bytes = new byte[length]; 473 in.readFully(bytes, 0, length); 474 return decode(bytes); 475 } 476 477 /** Write a UTF8 encoded string to out 478 */ 479 public static int writeString(DataOutput out, String s) throws IOException { 480 ByteBuffer bytes = encode(s); 481 int length = bytes.limit(); 482 WritableUtils.writeVInt(out, length); 483 out.write(bytes.array(), 0, length); 484 return length; 485 } 486 487 /** Write a UTF8 encoded string with a maximum size to out 488 */ 489 public static int writeString(DataOutput out, String s, int maxLength) 490 throws IOException { 491 ByteBuffer bytes = encode(s); 492 int length = bytes.limit(); 493 if (length > maxLength) { 494 throw new IOException("string was too long to write! Expected " + 495 "less than or equal to " + maxLength + " bytes, but got " + 496 length + " bytes."); 497 } 498 WritableUtils.writeVInt(out, length); 499 out.write(bytes.array(), 0, length); 500 return length; 501 } 502 503 ////// states for validateUTF8 504 505 private static final int LEAD_BYTE = 0; 506 507 private static final int TRAIL_BYTE_1 = 1; 508 509 private static final int TRAIL_BYTE = 2; 510 511 /** 512 * Check if a byte array contains valid utf-8 513 * @param utf8 byte array 514 * @throws MalformedInputException if the byte array contains invalid utf-8 515 */ 516 public static void validateUTF8(byte[] utf8) throws MalformedInputException { 517 validateUTF8(utf8, 0, utf8.length); 518 } 519 520 /** 521 * Check to see if a byte array is valid utf-8 522 * @param utf8 the array of bytes 523 * @param start the offset of the first byte in the array 524 * @param len the length of the byte sequence 525 * @throws MalformedInputException if the byte array contains invalid bytes 526 */ 527 public static void validateUTF8(byte[] utf8, int start, int len) 528 throws MalformedInputException { 529 int count = start; 530 int leadByte = 0; 531 int length = 0; 532 int state = LEAD_BYTE; 533 while (count < start+len) { 534 int aByte = utf8[count] & 0xFF; 535 536 switch (state) { 537 case LEAD_BYTE: 538 leadByte = aByte; 539 length = bytesFromUTF8[aByte]; 540 541 switch (length) { 542 case 0: // check for ASCII 543 if (leadByte > 0x7F) 544 throw new MalformedInputException(count); 545 break; 546 case 1: 547 if (leadByte < 0xC2 || leadByte > 0xDF) 548 throw new MalformedInputException(count); 549 state = TRAIL_BYTE_1; 550 break; 551 case 2: 552 if (leadByte < 0xE0 || leadByte > 0xEF) 553 throw new MalformedInputException(count); 554 state = TRAIL_BYTE_1; 555 break; 556 case 3: 557 if (leadByte < 0xF0 || leadByte > 0xF4) 558 throw new MalformedInputException(count); 559 state = TRAIL_BYTE_1; 560 break; 561 default: 562 // too long! Longest valid UTF-8 is 4 bytes (lead + three) 563 // or if < 0 we got a trail byte in the lead byte position 564 throw new MalformedInputException(count); 565 } // switch (length) 566 break; 567 568 case TRAIL_BYTE_1: 569 if (leadByte == 0xF0 && aByte < 0x90) 570 throw new MalformedInputException(count); 571 if (leadByte == 0xF4 && aByte > 0x8F) 572 throw new MalformedInputException(count); 573 if (leadByte == 0xE0 && aByte < 0xA0) 574 throw new MalformedInputException(count); 575 if (leadByte == 0xED && aByte > 0x9F) 576 throw new MalformedInputException(count); 577 // falls through to regular trail-byte test!! 578 case TRAIL_BYTE: 579 if (aByte < 0x80 || aByte > 0xBF) 580 throw new MalformedInputException(count); 581 if (--length == 0) { 582 state = LEAD_BYTE; 583 } else { 584 state = TRAIL_BYTE; 585 } 586 break; 587 default: 588 break; 589 } // switch (state) 590 count++; 591 } 592 } 593 594 /** 595 * Magic numbers for UTF-8. These are the number of bytes 596 * that <em>follow</em> a given lead byte. Trailing bytes 597 * have the value -1. The values 4 and 5 are presented in 598 * this table, even though valid UTF-8 cannot include the 599 * five and six byte sequences. 600 */ 601 static final int[] bytesFromUTF8 = 602 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 603 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 604 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 605 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 606 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 607 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 608 0, 0, 0, 0, 0, 0, 0, 609 // trail bytes 610 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 611 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 612 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 613 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 614 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 615 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 616 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; 617 618 /** 619 * Returns the next code point at the current position in 620 * the buffer. The buffer's position will be incremented. 621 * Any mark set on this buffer will be changed by this method! 622 */ 623 public static int bytesToCodePoint(ByteBuffer bytes) { 624 bytes.mark(); 625 byte b = bytes.get(); 626 bytes.reset(); 627 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; 628 if (extraBytesToRead < 0) return -1; // trailing byte! 629 int ch = 0; 630 631 switch (extraBytesToRead) { 632 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 633 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 634 case 3: ch += (bytes.get() & 0xFF); ch <<= 6; 635 case 2: ch += (bytes.get() & 0xFF); ch <<= 6; 636 case 1: ch += (bytes.get() & 0xFF); ch <<= 6; 637 case 0: ch += (bytes.get() & 0xFF); 638 } 639 ch -= offsetsFromUTF8[extraBytesToRead]; 640 641 return ch; 642 } 643 644 645 static final int offsetsFromUTF8[] = 646 { 0x00000000, 0x00003080, 647 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; 648 649 /** 650 * For the given string, returns the number of UTF-8 bytes 651 * required to encode the string. 652 * @param string text to encode 653 * @return number of UTF-8 bytes required to encode 654 */ 655 public static int utf8Length(String string) { 656 CharacterIterator iter = new StringCharacterIterator(string); 657 char ch = iter.first(); 658 int size = 0; 659 while (ch != CharacterIterator.DONE) { 660 if ((ch >= 0xD800) && (ch < 0xDC00)) { 661 // surrogate pair? 662 char trail = iter.next(); 663 if ((trail > 0xDBFF) && (trail < 0xE000)) { 664 // valid pair 665 size += 4; 666 } else { 667 // invalid pair 668 size += 3; 669 iter.previous(); // rewind one 670 } 671 } else if (ch < 0x80) { 672 size++; 673 } else if (ch < 0x800) { 674 size += 2; 675 } else { 676 // ch < 0x10000, that is, the largest char value 677 size += 3; 678 } 679 ch = iter.next(); 680 } 681 return size; 682 } 683}