001 /** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019 package org.apache.hadoop.io; 020 021 import java.io.IOException; 022 import java.io.DataInput; 023 import java.io.DataOutput; 024 import java.nio.ByteBuffer; 025 import java.nio.CharBuffer; 026 import java.nio.charset.CharacterCodingException; 027 import java.nio.charset.Charset; 028 import java.nio.charset.CharsetDecoder; 029 import java.nio.charset.CharsetEncoder; 030 import java.nio.charset.CodingErrorAction; 031 import java.nio.charset.MalformedInputException; 032 import java.text.CharacterIterator; 033 import java.text.StringCharacterIterator; 034 import java.util.Arrays; 035 036 import org.apache.avro.reflect.Stringable; 037 038 import org.apache.hadoop.classification.InterfaceAudience; 039 import org.apache.hadoop.classification.InterfaceStability; 040 041 /** This class stores text using standard UTF8 encoding. It provides methods 042 * to serialize, deserialize, and compare texts at byte level. The type of 043 * length is integer and is serialized using zero-compressed format. <p>In 044 * addition, it provides methods for string traversal without converting the 045 * byte array to a string. <p>Also includes utilities for 046 * serializing/deserialing a string, coding/decoding a string, checking if a 047 * byte array contains valid UTF8 code, calculating the length of an encoded 048 * string. 049 */ 050 @Stringable 051 @InterfaceAudience.Public 052 @InterfaceStability.Stable 053 public class Text extends BinaryComparable 054 implements WritableComparable<BinaryComparable> { 055 056 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = 057 new ThreadLocal<CharsetEncoder>() { 058 @Override 059 protected CharsetEncoder initialValue() { 060 return Charset.forName("UTF-8").newEncoder(). 061 onMalformedInput(CodingErrorAction.REPORT). 062 onUnmappableCharacter(CodingErrorAction.REPORT); 063 } 064 }; 065 066 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = 067 new ThreadLocal<CharsetDecoder>() { 068 @Override 069 protected CharsetDecoder initialValue() { 070 return Charset.forName("UTF-8").newDecoder(). 071 onMalformedInput(CodingErrorAction.REPORT). 072 onUnmappableCharacter(CodingErrorAction.REPORT); 073 } 074 }; 075 076 private static final byte [] EMPTY_BYTES = new byte[0]; 077 078 private byte[] bytes; 079 private int length; 080 081 public Text() { 082 bytes = EMPTY_BYTES; 083 } 084 085 /** Construct from a string. 086 */ 087 public Text(String string) { 088 set(string); 089 } 090 091 /** Construct from another text. */ 092 public Text(Text utf8) { 093 set(utf8); 094 } 095 096 /** Construct from a byte array. 097 */ 098 public Text(byte[] utf8) { 099 set(utf8); 100 } 101 102 /** 103 * Get a copy of the bytes that is exactly the length of the data. 104 * See {@link #getBytes()} for faster access to the underlying array. 105 */ 106 public byte[] copyBytes() { 107 byte[] result = new byte[length]; 108 System.arraycopy(bytes, 0, result, 0, length); 109 return result; 110 } 111 112 /** 113 * Returns the raw bytes; however, only data up to {@link #getLength()} is 114 * valid. Please use {@link #copyBytes()} if you 115 * need the returned array to be precisely the length of the data. 116 */ 117 @Override 118 public byte[] getBytes() { 119 return bytes; 120 } 121 122 /** Returns the number of bytes in the byte array */ 123 @Override 124 public int getLength() { 125 return length; 126 } 127 128 /** 129 * Returns the Unicode Scalar Value (32-bit integer value) 130 * for the character at <code>position</code>. Note that this 131 * method avoids using the converter or doing String instantiation 132 * @return the Unicode scalar value at position or -1 133 * if the position is invalid or points to a 134 * trailing byte 135 */ 136 public int charAt(int position) { 137 if (position > this.length) return -1; // too long 138 if (position < 0) return -1; // duh. 139 140 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position); 141 return bytesToCodePoint(bb.slice()); 142 } 143 144 public int find(String what) { 145 return find(what, 0); 146 } 147 148 /** 149 * Finds any occurence of <code>what</code> in the backing 150 * buffer, starting as position <code>start</code>. The starting 151 * position is measured in bytes and the return value is in 152 * terms of byte position in the buffer. The backing buffer is 153 * not converted to a string for this operation. 154 * @return byte position of the first occurence of the search 155 * string in the UTF-8 buffer or -1 if not found 156 */ 157 public int find(String what, int start) { 158 try { 159 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length); 160 ByteBuffer tgt = encode(what); 161 byte b = tgt.get(); 162 src.position(start); 163 164 while (src.hasRemaining()) { 165 if (b == src.get()) { // matching first byte 166 src.mark(); // save position in loop 167 tgt.mark(); // save position in target 168 boolean found = true; 169 int pos = src.position()-1; 170 while (tgt.hasRemaining()) { 171 if (!src.hasRemaining()) { // src expired first 172 tgt.reset(); 173 src.reset(); 174 found = false; 175 break; 176 } 177 if (!(tgt.get() == src.get())) { 178 tgt.reset(); 179 src.reset(); 180 found = false; 181 break; // no match 182 } 183 } 184 if (found) return pos; 185 } 186 } 187 return -1; // not found 188 } catch (CharacterCodingException e) { 189 // can't get here 190 e.printStackTrace(); 191 return -1; 192 } 193 } 194 /** Set to contain the contents of a string. 195 */ 196 public void set(String string) { 197 try { 198 ByteBuffer bb = encode(string, true); 199 bytes = bb.array(); 200 length = bb.limit(); 201 }catch(CharacterCodingException e) { 202 throw new RuntimeException("Should not have happened ", e); 203 } 204 } 205 206 /** Set to a utf8 byte array 207 */ 208 public void set(byte[] utf8) { 209 set(utf8, 0, utf8.length); 210 } 211 212 /** copy a text. */ 213 public void set(Text other) { 214 set(other.getBytes(), 0, other.getLength()); 215 } 216 217 /** 218 * Set the Text to range of bytes 219 * @param utf8 the data to copy from 220 * @param start the first position of the new string 221 * @param len the number of bytes of the new string 222 */ 223 public void set(byte[] utf8, int start, int len) { 224 setCapacity(len, false); 225 System.arraycopy(utf8, start, bytes, 0, len); 226 this.length = len; 227 } 228 229 /** 230 * Append a range of bytes to the end of the given text 231 * @param utf8 the data to copy from 232 * @param start the first position to append from utf8 233 * @param len the number of bytes to append 234 */ 235 public void append(byte[] utf8, int start, int len) { 236 setCapacity(length + len, true); 237 System.arraycopy(utf8, start, bytes, length, len); 238 length += len; 239 } 240 241 /** 242 * Clear the string to empty. 243 * 244 * <em>Note</em>: For performance reasons, this call does not clear the 245 * underlying byte array that is retrievable via {@link #getBytes()}. 246 * In order to free the byte-array memory, call {@link #set(byte[])} 247 * with an empty byte array (For example, <code>new byte[0]</code>). 248 */ 249 public void clear() { 250 length = 0; 251 } 252 253 /* 254 * Sets the capacity of this Text object to <em>at least</em> 255 * <code>len</code> bytes. If the current buffer is longer, 256 * then the capacity and existing content of the buffer are 257 * unchanged. If <code>len</code> is larger 258 * than the current capacity, the Text object's capacity is 259 * increased to match. 260 * @param len the number of bytes we need 261 * @param keepData should the old data be kept 262 */ 263 private void setCapacity(int len, boolean keepData) { 264 if (bytes == null || bytes.length < len) { 265 if (bytes != null && keepData) { 266 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1)); 267 } else { 268 bytes = new byte[len]; 269 } 270 } 271 } 272 273 /** 274 * Convert text back to string 275 * @see java.lang.Object#toString() 276 */ 277 @Override 278 public String toString() { 279 try { 280 return decode(bytes, 0, length); 281 } catch (CharacterCodingException e) { 282 throw new RuntimeException("Should not have happened " , e); 283 } 284 } 285 286 /** deserialize 287 */ 288 @Override 289 public void readFields(DataInput in) throws IOException { 290 int newLength = WritableUtils.readVInt(in); 291 setCapacity(newLength, false); 292 in.readFully(bytes, 0, newLength); 293 length = newLength; 294 } 295 296 public void readFields(DataInput in, int maxLength) throws IOException { 297 int newLength = WritableUtils.readVInt(in); 298 if (newLength < 0) { 299 throw new IOException("tried to deserialize " + newLength + 300 " bytes of data! newLength must be non-negative."); 301 } else if (newLength >= maxLength) { 302 throw new IOException("tried to deserialize " + newLength + 303 " bytes of data, but maxLength = " + maxLength); 304 } 305 setCapacity(newLength, false); 306 in.readFully(bytes, 0, newLength); 307 length = newLength; 308 } 309 310 /** Skips over one Text in the input. */ 311 public static void skip(DataInput in) throws IOException { 312 int length = WritableUtils.readVInt(in); 313 WritableUtils.skipFully(in, length); 314 } 315 316 /** serialize 317 * write this object to out 318 * length uses zero-compressed encoding 319 * @see Writable#write(DataOutput) 320 */ 321 @Override 322 public void write(DataOutput out) throws IOException { 323 WritableUtils.writeVInt(out, length); 324 out.write(bytes, 0, length); 325 } 326 327 public void write(DataOutput out, int maxLength) throws IOException { 328 if (length > maxLength) { 329 throw new IOException("data was too long to write! Expected " + 330 "less than or equal to " + maxLength + " bytes, but got " + 331 length + " bytes."); 332 } 333 WritableUtils.writeVInt(out, length); 334 out.write(bytes, 0, length); 335 } 336 337 /** Returns true iff <code>o</code> is a Text with the same contents. */ 338 @Override 339 public boolean equals(Object o) { 340 if (o instanceof Text) 341 return super.equals(o); 342 return false; 343 } 344 345 @Override 346 public int hashCode() { 347 return super.hashCode(); 348 } 349 350 /** A WritableComparator optimized for Text keys. */ 351 public static class Comparator extends WritableComparator { 352 public Comparator() { 353 super(Text.class); 354 } 355 356 @Override 357 public int compare(byte[] b1, int s1, int l1, 358 byte[] b2, int s2, int l2) { 359 int n1 = WritableUtils.decodeVIntSize(b1[s1]); 360 int n2 = WritableUtils.decodeVIntSize(b2[s2]); 361 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 362 } 363 } 364 365 static { 366 // register this comparator 367 WritableComparator.define(Text.class, new Comparator()); 368 } 369 370 /// STATIC UTILITIES FROM HERE DOWN 371 /** 372 * Converts the provided byte array to a String using the 373 * UTF-8 encoding. If the input is malformed, 374 * replace by a default value. 375 */ 376 public static String decode(byte[] utf8) throws CharacterCodingException { 377 return decode(ByteBuffer.wrap(utf8), true); 378 } 379 380 public static String decode(byte[] utf8, int start, int length) 381 throws CharacterCodingException { 382 return decode(ByteBuffer.wrap(utf8, start, length), true); 383 } 384 385 /** 386 * Converts the provided byte array to a String using the 387 * UTF-8 encoding. If <code>replace</code> is true, then 388 * malformed input is replaced with the 389 * substitution character, which is U+FFFD. Otherwise the 390 * method throws a MalformedInputException. 391 */ 392 public static String decode(byte[] utf8, int start, int length, boolean replace) 393 throws CharacterCodingException { 394 return decode(ByteBuffer.wrap(utf8, start, length), replace); 395 } 396 397 private static String decode(ByteBuffer utf8, boolean replace) 398 throws CharacterCodingException { 399 CharsetDecoder decoder = DECODER_FACTORY.get(); 400 if (replace) { 401 decoder.onMalformedInput( 402 java.nio.charset.CodingErrorAction.REPLACE); 403 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 404 } 405 String str = decoder.decode(utf8).toString(); 406 // set decoder back to its default value: REPORT 407 if (replace) { 408 decoder.onMalformedInput(CodingErrorAction.REPORT); 409 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 410 } 411 return str; 412 } 413 414 /** 415 * Converts the provided String to bytes using the 416 * UTF-8 encoding. If the input is malformed, 417 * invalid chars are replaced by a default value. 418 * @return ByteBuffer: bytes stores at ByteBuffer.array() 419 * and length is ByteBuffer.limit() 420 */ 421 422 public static ByteBuffer encode(String string) 423 throws CharacterCodingException { 424 return encode(string, true); 425 } 426 427 /** 428 * Converts the provided String to bytes using the 429 * UTF-8 encoding. If <code>replace</code> is true, then 430 * malformed input is replaced with the 431 * substitution character, which is U+FFFD. Otherwise the 432 * method throws a MalformedInputException. 433 * @return ByteBuffer: bytes stores at ByteBuffer.array() 434 * and length is ByteBuffer.limit() 435 */ 436 public static ByteBuffer encode(String string, boolean replace) 437 throws CharacterCodingException { 438 CharsetEncoder encoder = ENCODER_FACTORY.get(); 439 if (replace) { 440 encoder.onMalformedInput(CodingErrorAction.REPLACE); 441 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 442 } 443 ByteBuffer bytes = 444 encoder.encode(CharBuffer.wrap(string.toCharArray())); 445 if (replace) { 446 encoder.onMalformedInput(CodingErrorAction.REPORT); 447 encoder.onUnmappableCharacter(CodingErrorAction.REPORT); 448 } 449 return bytes; 450 } 451 452 static final public int DEFAULT_MAX_LEN = 1024 * 1024; 453 454 /** Read a UTF8 encoded string from in 455 */ 456 public static String readString(DataInput in) throws IOException { 457 return readString(in, Integer.MAX_VALUE); 458 } 459 460 /** Read a UTF8 encoded string with a maximum size 461 */ 462 public static String readString(DataInput in, int maxLength) 463 throws IOException { 464 int length = WritableUtils.readVIntInRange(in, 0, maxLength); 465 byte [] bytes = new byte[length]; 466 in.readFully(bytes, 0, length); 467 return decode(bytes); 468 } 469 470 /** Write a UTF8 encoded string to out 471 */ 472 public static int writeString(DataOutput out, String s) throws IOException { 473 ByteBuffer bytes = encode(s); 474 int length = bytes.limit(); 475 WritableUtils.writeVInt(out, length); 476 out.write(bytes.array(), 0, length); 477 return length; 478 } 479 480 /** Write a UTF8 encoded string with a maximum size to out 481 */ 482 public static int writeString(DataOutput out, String s, int maxLength) 483 throws IOException { 484 ByteBuffer bytes = encode(s); 485 int length = bytes.limit(); 486 if (length > maxLength) { 487 throw new IOException("string was too long to write! Expected " + 488 "less than or equal to " + maxLength + " bytes, but got " + 489 length + " bytes."); 490 } 491 WritableUtils.writeVInt(out, length); 492 out.write(bytes.array(), 0, length); 493 return length; 494 } 495 496 ////// states for validateUTF8 497 498 private static final int LEAD_BYTE = 0; 499 500 private static final int TRAIL_BYTE_1 = 1; 501 502 private static final int TRAIL_BYTE = 2; 503 504 /** 505 * Check if a byte array contains valid utf-8 506 * @param utf8 byte array 507 * @throws MalformedInputException if the byte array contains invalid utf-8 508 */ 509 public static void validateUTF8(byte[] utf8) throws MalformedInputException { 510 validateUTF8(utf8, 0, utf8.length); 511 } 512 513 /** 514 * Check to see if a byte array is valid utf-8 515 * @param utf8 the array of bytes 516 * @param start the offset of the first byte in the array 517 * @param len the length of the byte sequence 518 * @throws MalformedInputException if the byte array contains invalid bytes 519 */ 520 public static void validateUTF8(byte[] utf8, int start, int len) 521 throws MalformedInputException { 522 int count = start; 523 int leadByte = 0; 524 int length = 0; 525 int state = LEAD_BYTE; 526 while (count < start+len) { 527 int aByte = utf8[count] & 0xFF; 528 529 switch (state) { 530 case LEAD_BYTE: 531 leadByte = aByte; 532 length = bytesFromUTF8[aByte]; 533 534 switch (length) { 535 case 0: // check for ASCII 536 if (leadByte > 0x7F) 537 throw new MalformedInputException(count); 538 break; 539 case 1: 540 if (leadByte < 0xC2 || leadByte > 0xDF) 541 throw new MalformedInputException(count); 542 state = TRAIL_BYTE_1; 543 break; 544 case 2: 545 if (leadByte < 0xE0 || leadByte > 0xEF) 546 throw new MalformedInputException(count); 547 state = TRAIL_BYTE_1; 548 break; 549 case 3: 550 if (leadByte < 0xF0 || leadByte > 0xF4) 551 throw new MalformedInputException(count); 552 state = TRAIL_BYTE_1; 553 break; 554 default: 555 // too long! Longest valid UTF-8 is 4 bytes (lead + three) 556 // or if < 0 we got a trail byte in the lead byte position 557 throw new MalformedInputException(count); 558 } // switch (length) 559 break; 560 561 case TRAIL_BYTE_1: 562 if (leadByte == 0xF0 && aByte < 0x90) 563 throw new MalformedInputException(count); 564 if (leadByte == 0xF4 && aByte > 0x8F) 565 throw new MalformedInputException(count); 566 if (leadByte == 0xE0 && aByte < 0xA0) 567 throw new MalformedInputException(count); 568 if (leadByte == 0xED && aByte > 0x9F) 569 throw new MalformedInputException(count); 570 // falls through to regular trail-byte test!! 571 case TRAIL_BYTE: 572 if (aByte < 0x80 || aByte > 0xBF) 573 throw new MalformedInputException(count); 574 if (--length == 0) { 575 state = LEAD_BYTE; 576 } else { 577 state = TRAIL_BYTE; 578 } 579 break; 580 } // switch (state) 581 count++; 582 } 583 } 584 585 /** 586 * Magic numbers for UTF-8. These are the number of bytes 587 * that <em>follow</em> a given lead byte. Trailing bytes 588 * have the value -1. The values 4 and 5 are presented in 589 * this table, even though valid UTF-8 cannot include the 590 * five and six byte sequences. 591 */ 592 static final int[] bytesFromUTF8 = 593 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 594 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 595 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 596 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 597 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 598 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 599 0, 0, 0, 0, 0, 0, 0, 600 // trail bytes 601 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 602 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 603 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 604 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 605 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 606 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 607 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; 608 609 /** 610 * Returns the next code point at the current position in 611 * the buffer. The buffer's position will be incremented. 612 * Any mark set on this buffer will be changed by this method! 613 */ 614 public static int bytesToCodePoint(ByteBuffer bytes) { 615 bytes.mark(); 616 byte b = bytes.get(); 617 bytes.reset(); 618 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; 619 if (extraBytesToRead < 0) return -1; // trailing byte! 620 int ch = 0; 621 622 switch (extraBytesToRead) { 623 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 624 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 625 case 3: ch += (bytes.get() & 0xFF); ch <<= 6; 626 case 2: ch += (bytes.get() & 0xFF); ch <<= 6; 627 case 1: ch += (bytes.get() & 0xFF); ch <<= 6; 628 case 0: ch += (bytes.get() & 0xFF); 629 } 630 ch -= offsetsFromUTF8[extraBytesToRead]; 631 632 return ch; 633 } 634 635 636 static final int offsetsFromUTF8[] = 637 { 0x00000000, 0x00003080, 638 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; 639 640 /** 641 * For the given string, returns the number of UTF-8 bytes 642 * required to encode the string. 643 * @param string text to encode 644 * @return number of UTF-8 bytes required to encode 645 */ 646 public static int utf8Length(String string) { 647 CharacterIterator iter = new StringCharacterIterator(string); 648 char ch = iter.first(); 649 int size = 0; 650 while (ch != CharacterIterator.DONE) { 651 if ((ch >= 0xD800) && (ch < 0xDC00)) { 652 // surrogate pair? 653 char trail = iter.next(); 654 if ((trail > 0xDBFF) && (trail < 0xE000)) { 655 // valid pair 656 size += 4; 657 } else { 658 // invalid pair 659 size += 3; 660 iter.previous(); // rewind one 661 } 662 } else if (ch < 0x80) { 663 size++; 664 } else if (ch < 0x800) { 665 size += 2; 666 } else { 667 // ch < 0x10000, that is, the largest char value 668 size += 3; 669 } 670 ch = iter.next(); 671 } 672 return size; 673 } 674 }