001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.io; 020 021import java.io.IOException; 022import java.io.DataInput; 023import java.io.DataOutput; 024import java.nio.ByteBuffer; 025import java.nio.CharBuffer; 026import java.nio.charset.CharacterCodingException; 027import java.nio.charset.Charset; 028import java.nio.charset.CharsetDecoder; 029import java.nio.charset.CharsetEncoder; 030import java.nio.charset.CodingErrorAction; 031import java.nio.charset.MalformedInputException; 032import java.text.CharacterIterator; 033import java.text.StringCharacterIterator; 034import java.util.Arrays; 035 036import org.apache.avro.reflect.Stringable; 037 038import org.apache.hadoop.classification.InterfaceAudience; 039import org.apache.hadoop.classification.InterfaceStability; 040 041/** This class stores text using standard UTF8 encoding. It provides methods 042 * to serialize, deserialize, and compare texts at byte level. The type of 043 * length is integer and is serialized using zero-compressed format. <p>In 044 * addition, it provides methods for string traversal without converting the 045 * byte array to a string. <p>Also includes utilities for 046 * serializing/deserialing a string, coding/decoding a string, checking if a 047 * byte array contains valid UTF8 code, calculating the length of an encoded 048 * string. 049 */ 050@Stringable 051@InterfaceAudience.Public 052@InterfaceStability.Stable 053public class Text extends BinaryComparable 054 implements WritableComparable<BinaryComparable> { 055 056 private static ThreadLocal<CharsetEncoder> ENCODER_FACTORY = 057 new ThreadLocal<CharsetEncoder>() { 058 protected CharsetEncoder initialValue() { 059 return Charset.forName("UTF-8").newEncoder(). 060 onMalformedInput(CodingErrorAction.REPORT). 061 onUnmappableCharacter(CodingErrorAction.REPORT); 062 } 063 }; 064 065 private static ThreadLocal<CharsetDecoder> DECODER_FACTORY = 066 new ThreadLocal<CharsetDecoder>() { 067 protected CharsetDecoder initialValue() { 068 return Charset.forName("UTF-8").newDecoder(). 069 onMalformedInput(CodingErrorAction.REPORT). 070 onUnmappableCharacter(CodingErrorAction.REPORT); 071 } 072 }; 073 074 private static final byte [] EMPTY_BYTES = new byte[0]; 075 076 private byte[] bytes; 077 private int length; 078 079 public Text() { 080 bytes = EMPTY_BYTES; 081 } 082 083 /** Construct from a string. 084 */ 085 public Text(String string) { 086 set(string); 087 } 088 089 /** Construct from another text. */ 090 public Text(Text utf8) { 091 set(utf8); 092 } 093 094 /** Construct from a byte array. 095 */ 096 public Text(byte[] utf8) { 097 set(utf8); 098 } 099 100 /** 101 * Get a copy of the bytes that is exactly the length of the data. 102 * See {@link #getBytes()} for faster access to the underlying array. 103 */ 104 public byte[] copyBytes() { 105 byte[] result = new byte[length]; 106 System.arraycopy(bytes, 0, result, 0, length); 107 return result; 108 } 109 110 /** 111 * Returns the raw bytes; however, only data up to {@link #getLength()} is 112 * valid. Please use {@link #copyBytes()} if you 113 * need the returned array to be precisely the length of the data. 114 */ 115 public byte[] getBytes() { 116 return bytes; 117 } 118 119 /** Returns the number of bytes in the byte array */ 120 public int getLength() { 121 return length; 122 } 123 124 /** 125 * Returns the Unicode Scalar Value (32-bit integer value) 126 * for the character at <code>position</code>. Note that this 127 * method avoids using the converter or doing String instatiation 128 * @return the Unicode scalar value at position or -1 129 * if the position is invalid or points to a 130 * trailing byte 131 */ 132 public int charAt(int position) { 133 if (position > this.length) return -1; // too long 134 if (position < 0) return -1; // duh. 135 136 ByteBuffer bb = (ByteBuffer)ByteBuffer.wrap(bytes).position(position); 137 return bytesToCodePoint(bb.slice()); 138 } 139 140 public int find(String what) { 141 return find(what, 0); 142 } 143 144 /** 145 * Finds any occurence of <code>what</code> in the backing 146 * buffer, starting as position <code>start</code>. The starting 147 * position is measured in bytes and the return value is in 148 * terms of byte position in the buffer. The backing buffer is 149 * not converted to a string for this operation. 150 * @return byte position of the first occurence of the search 151 * string in the UTF-8 buffer or -1 if not found 152 */ 153 public int find(String what, int start) { 154 try { 155 ByteBuffer src = ByteBuffer.wrap(this.bytes,0,this.length); 156 ByteBuffer tgt = encode(what); 157 byte b = tgt.get(); 158 src.position(start); 159 160 while (src.hasRemaining()) { 161 if (b == src.get()) { // matching first byte 162 src.mark(); // save position in loop 163 tgt.mark(); // save position in target 164 boolean found = true; 165 int pos = src.position()-1; 166 while (tgt.hasRemaining()) { 167 if (!src.hasRemaining()) { // src expired first 168 tgt.reset(); 169 src.reset(); 170 found = false; 171 break; 172 } 173 if (!(tgt.get() == src.get())) { 174 tgt.reset(); 175 src.reset(); 176 found = false; 177 break; // no match 178 } 179 } 180 if (found) return pos; 181 } 182 } 183 return -1; // not found 184 } catch (CharacterCodingException e) { 185 // can't get here 186 e.printStackTrace(); 187 return -1; 188 } 189 } 190 /** Set to contain the contents of a string. 191 */ 192 public void set(String string) { 193 try { 194 ByteBuffer bb = encode(string, true); 195 bytes = bb.array(); 196 length = bb.limit(); 197 }catch(CharacterCodingException e) { 198 throw new RuntimeException("Should not have happened ", e); 199 } 200 } 201 202 /** Set to a utf8 byte array 203 */ 204 public void set(byte[] utf8) { 205 set(utf8, 0, utf8.length); 206 } 207 208 /** copy a text. */ 209 public void set(Text other) { 210 set(other.getBytes(), 0, other.getLength()); 211 } 212 213 /** 214 * Set the Text to range of bytes 215 * @param utf8 the data to copy from 216 * @param start the first position of the new string 217 * @param len the number of bytes of the new string 218 */ 219 public void set(byte[] utf8, int start, int len) { 220 setCapacity(len, false); 221 System.arraycopy(utf8, start, bytes, 0, len); 222 this.length = len; 223 } 224 225 /** 226 * Append a range of bytes to the end of the given text 227 * @param utf8 the data to copy from 228 * @param start the first position to append from utf8 229 * @param len the number of bytes to append 230 */ 231 public void append(byte[] utf8, int start, int len) { 232 setCapacity(length + len, true); 233 System.arraycopy(utf8, start, bytes, length, len); 234 length += len; 235 } 236 237 /** 238 * Clear the string to empty. 239 */ 240 public void clear() { 241 length = 0; 242 } 243 244 /* 245 * Sets the capacity of this Text object to <em>at least</em> 246 * <code>len</code> bytes. If the current buffer is longer, 247 * then the capacity and existing content of the buffer are 248 * unchanged. If <code>len</code> is larger 249 * than the current capacity, the Text object's capacity is 250 * increased to match. 251 * @param len the number of bytes we need 252 * @param keepData should the old data be kept 253 */ 254 private void setCapacity(int len, boolean keepData) { 255 if (bytes == null || bytes.length < len) { 256 if (bytes != null && keepData) { 257 bytes = Arrays.copyOf(bytes, Math.max(len,length << 1)); 258 } else { 259 bytes = new byte[len]; 260 } 261 } 262 } 263 264 /** 265 * Convert text back to string 266 * @see java.lang.Object#toString() 267 */ 268 @Override 269 public String toString() { 270 try { 271 return decode(bytes, 0, length); 272 } catch (CharacterCodingException e) { 273 throw new RuntimeException("Should not have happened " , e); 274 } 275 } 276 277 /** deserialize 278 */ 279 public void readFields(DataInput in) throws IOException { 280 int newLength = WritableUtils.readVInt(in); 281 setCapacity(newLength, false); 282 in.readFully(bytes, 0, newLength); 283 length = newLength; 284 } 285 286 /** Skips over one Text in the input. */ 287 public static void skip(DataInput in) throws IOException { 288 int length = WritableUtils.readVInt(in); 289 WritableUtils.skipFully(in, length); 290 } 291 292 /** serialize 293 * write this object to out 294 * length uses zero-compressed encoding 295 * @see Writable#write(DataOutput) 296 */ 297 public void write(DataOutput out) throws IOException { 298 WritableUtils.writeVInt(out, length); 299 out.write(bytes, 0, length); 300 } 301 302 /** Returns true iff <code>o</code> is a Text with the same contents. */ 303 public boolean equals(Object o) { 304 if (o instanceof Text) 305 return super.equals(o); 306 return false; 307 } 308 309 @Override 310 public int hashCode() { 311 return super.hashCode(); 312 } 313 314 /** A WritableComparator optimized for Text keys. */ 315 public static class Comparator extends WritableComparator { 316 public Comparator() { 317 super(Text.class); 318 } 319 320 public int compare(byte[] b1, int s1, int l1, 321 byte[] b2, int s2, int l2) { 322 int n1 = WritableUtils.decodeVIntSize(b1[s1]); 323 int n2 = WritableUtils.decodeVIntSize(b2[s2]); 324 return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2); 325 } 326 } 327 328 static { 329 // register this comparator 330 WritableComparator.define(Text.class, new Comparator()); 331 } 332 333 /// STATIC UTILITIES FROM HERE DOWN 334 /** 335 * Converts the provided byte array to a String using the 336 * UTF-8 encoding. If the input is malformed, 337 * replace by a default value. 338 */ 339 public static String decode(byte[] utf8) throws CharacterCodingException { 340 return decode(ByteBuffer.wrap(utf8), true); 341 } 342 343 public static String decode(byte[] utf8, int start, int length) 344 throws CharacterCodingException { 345 return decode(ByteBuffer.wrap(utf8, start, length), true); 346 } 347 348 /** 349 * Converts the provided byte array to a String using the 350 * UTF-8 encoding. If <code>replace</code> is true, then 351 * malformed input is replaced with the 352 * substitution character, which is U+FFFD. Otherwise the 353 * method throws a MalformedInputException. 354 */ 355 public static String decode(byte[] utf8, int start, int length, boolean replace) 356 throws CharacterCodingException { 357 return decode(ByteBuffer.wrap(utf8, start, length), replace); 358 } 359 360 private static String decode(ByteBuffer utf8, boolean replace) 361 throws CharacterCodingException { 362 CharsetDecoder decoder = DECODER_FACTORY.get(); 363 if (replace) { 364 decoder.onMalformedInput( 365 java.nio.charset.CodingErrorAction.REPLACE); 366 decoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 367 } 368 String str = decoder.decode(utf8).toString(); 369 // set decoder back to its default value: REPORT 370 if (replace) { 371 decoder.onMalformedInput(CodingErrorAction.REPORT); 372 decoder.onUnmappableCharacter(CodingErrorAction.REPORT); 373 } 374 return str; 375 } 376 377 /** 378 * Converts the provided String to bytes using the 379 * UTF-8 encoding. If the input is malformed, 380 * invalid chars are replaced by a default value. 381 * @return ByteBuffer: bytes stores at ByteBuffer.array() 382 * and length is ByteBuffer.limit() 383 */ 384 385 public static ByteBuffer encode(String string) 386 throws CharacterCodingException { 387 return encode(string, true); 388 } 389 390 /** 391 * Converts the provided String to bytes using the 392 * UTF-8 encoding. If <code>replace</code> is true, then 393 * malformed input is replaced with the 394 * substitution character, which is U+FFFD. Otherwise the 395 * method throws a MalformedInputException. 396 * @return ByteBuffer: bytes stores at ByteBuffer.array() 397 * and length is ByteBuffer.limit() 398 */ 399 public static ByteBuffer encode(String string, boolean replace) 400 throws CharacterCodingException { 401 CharsetEncoder encoder = ENCODER_FACTORY.get(); 402 if (replace) { 403 encoder.onMalformedInput(CodingErrorAction.REPLACE); 404 encoder.onUnmappableCharacter(CodingErrorAction.REPLACE); 405 } 406 ByteBuffer bytes = 407 encoder.encode(CharBuffer.wrap(string.toCharArray())); 408 if (replace) { 409 encoder.onMalformedInput(CodingErrorAction.REPORT); 410 encoder.onUnmappableCharacter(CodingErrorAction.REPORT); 411 } 412 return bytes; 413 } 414 415 /** Read a UTF8 encoded string from in 416 */ 417 public static String readString(DataInput in) throws IOException { 418 int length = WritableUtils.readVInt(in); 419 byte [] bytes = new byte[length]; 420 in.readFully(bytes, 0, length); 421 return decode(bytes); 422 } 423 424 /** Write a UTF8 encoded string to out 425 */ 426 public static int writeString(DataOutput out, String s) throws IOException { 427 ByteBuffer bytes = encode(s); 428 int length = bytes.limit(); 429 WritableUtils.writeVInt(out, length); 430 out.write(bytes.array(), 0, length); 431 return length; 432 } 433 434 ////// states for validateUTF8 435 436 private static final int LEAD_BYTE = 0; 437 438 private static final int TRAIL_BYTE_1 = 1; 439 440 private static final int TRAIL_BYTE = 2; 441 442 /** 443 * Check if a byte array contains valid utf-8 444 * @param utf8 byte array 445 * @throws MalformedInputException if the byte array contains invalid utf-8 446 */ 447 public static void validateUTF8(byte[] utf8) throws MalformedInputException { 448 validateUTF8(utf8, 0, utf8.length); 449 } 450 451 /** 452 * Check to see if a byte array is valid utf-8 453 * @param utf8 the array of bytes 454 * @param start the offset of the first byte in the array 455 * @param len the length of the byte sequence 456 * @throws MalformedInputException if the byte array contains invalid bytes 457 */ 458 public static void validateUTF8(byte[] utf8, int start, int len) 459 throws MalformedInputException { 460 int count = start; 461 int leadByte = 0; 462 int length = 0; 463 int state = LEAD_BYTE; 464 while (count < start+len) { 465 int aByte = ((int) utf8[count] & 0xFF); 466 467 switch (state) { 468 case LEAD_BYTE: 469 leadByte = aByte; 470 length = bytesFromUTF8[aByte]; 471 472 switch (length) { 473 case 0: // check for ASCII 474 if (leadByte > 0x7F) 475 throw new MalformedInputException(count); 476 break; 477 case 1: 478 if (leadByte < 0xC2 || leadByte > 0xDF) 479 throw new MalformedInputException(count); 480 state = TRAIL_BYTE_1; 481 break; 482 case 2: 483 if (leadByte < 0xE0 || leadByte > 0xEF) 484 throw new MalformedInputException(count); 485 state = TRAIL_BYTE_1; 486 break; 487 case 3: 488 if (leadByte < 0xF0 || leadByte > 0xF4) 489 throw new MalformedInputException(count); 490 state = TRAIL_BYTE_1; 491 break; 492 default: 493 // too long! Longest valid UTF-8 is 4 bytes (lead + three) 494 // or if < 0 we got a trail byte in the lead byte position 495 throw new MalformedInputException(count); 496 } // switch (length) 497 break; 498 499 case TRAIL_BYTE_1: 500 if (leadByte == 0xF0 && aByte < 0x90) 501 throw new MalformedInputException(count); 502 if (leadByte == 0xF4 && aByte > 0x8F) 503 throw new MalformedInputException(count); 504 if (leadByte == 0xE0 && aByte < 0xA0) 505 throw new MalformedInputException(count); 506 if (leadByte == 0xED && aByte > 0x9F) 507 throw new MalformedInputException(count); 508 // falls through to regular trail-byte test!! 509 case TRAIL_BYTE: 510 if (aByte < 0x80 || aByte > 0xBF) 511 throw new MalformedInputException(count); 512 if (--length == 0) { 513 state = LEAD_BYTE; 514 } else { 515 state = TRAIL_BYTE; 516 } 517 break; 518 } // switch (state) 519 count++; 520 } 521 } 522 523 /** 524 * Magic numbers for UTF-8. These are the number of bytes 525 * that <em>follow</em> a given lead byte. Trailing bytes 526 * have the value -1. The values 4 and 5 are presented in 527 * this table, even though valid UTF-8 cannot include the 528 * five and six byte sequences. 529 */ 530 static final int[] bytesFromUTF8 = 531 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 532 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 533 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 534 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 535 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 536 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 537 0, 0, 0, 0, 0, 0, 0, 538 // trail bytes 539 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 540 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 541 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 542 -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 1, 543 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 544 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 545 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 }; 546 547 /** 548 * Returns the next code point at the current position in 549 * the buffer. The buffer's position will be incremented. 550 * Any mark set on this buffer will be changed by this method! 551 */ 552 public static int bytesToCodePoint(ByteBuffer bytes) { 553 bytes.mark(); 554 byte b = bytes.get(); 555 bytes.reset(); 556 int extraBytesToRead = bytesFromUTF8[(b & 0xFF)]; 557 if (extraBytesToRead < 0) return -1; // trailing byte! 558 int ch = 0; 559 560 switch (extraBytesToRead) { 561 case 5: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 562 case 4: ch += (bytes.get() & 0xFF); ch <<= 6; /* remember, illegal UTF-8 */ 563 case 3: ch += (bytes.get() & 0xFF); ch <<= 6; 564 case 2: ch += (bytes.get() & 0xFF); ch <<= 6; 565 case 1: ch += (bytes.get() & 0xFF); ch <<= 6; 566 case 0: ch += (bytes.get() & 0xFF); 567 } 568 ch -= offsetsFromUTF8[extraBytesToRead]; 569 570 return ch; 571 } 572 573 574 static final int offsetsFromUTF8[] = 575 { 0x00000000, 0x00003080, 576 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 }; 577 578 /** 579 * For the given string, returns the number of UTF-8 bytes 580 * required to encode the string. 581 * @param string text to encode 582 * @return number of UTF-8 bytes required to encode 583 */ 584 public static int utf8Length(String string) { 585 CharacterIterator iter = new StringCharacterIterator(string); 586 char ch = iter.first(); 587 int size = 0; 588 while (ch != CharacterIterator.DONE) { 589 if ((ch >= 0xD800) && (ch < 0xDC00)) { 590 // surrogate pair? 591 char trail = iter.next(); 592 if ((trail > 0xDBFF) && (trail < 0xE000)) { 593 // valid pair 594 size += 4; 595 } else { 596 // invalid pair 597 size += 3; 598 iter.previous(); // rewind one 599 } 600 } else if (ch < 0x80) { 601 size++; 602 } else if (ch < 0x800) { 603 size += 2; 604 } else { 605 // ch < 0x10000, that is, the largest char value 606 size += 3; 607 } 608 ch = iter.next(); 609 } 610 return size; 611 } 612}