001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.io; 020 021import java.io.IOException; 022import java.io.DataInput; 023import java.io.DataOutput; 024import java.io.UTFDataFormatException; 025 026import org.apache.hadoop.util.StringUtils; 027 028import org.apache.commons.logging.*; 029import org.apache.hadoop.classification.InterfaceAudience; 030import org.apache.hadoop.classification.InterfaceStability; 031 032/** A WritableComparable for strings that uses the UTF8 encoding. 033 * 034 * <p>Also includes utilities for efficiently reading and writing UTF-8. 035 * 036 * Note that this decodes UTF-8 but actually encodes CESU-8, a variant of 037 * UTF-8: see http://en.wikipedia.org/wiki/CESU-8 038 * 039 * @deprecated replaced by Text 040 */ 041@Deprecated 042@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"}) 043@InterfaceStability.Stable 044public class UTF8 implements WritableComparable<UTF8> { 045 private static final Log LOG= LogFactory.getLog(UTF8.class); 046 private static final DataInputBuffer IBUF = new DataInputBuffer(); 047 048 private static final ThreadLocal<DataOutputBuffer> OBUF_FACTORY = 049 new ThreadLocal<DataOutputBuffer>(){ 050 @Override 051 protected DataOutputBuffer initialValue() { 052 return new DataOutputBuffer(); 053 } 054 }; 055 056 private static final byte[] EMPTY_BYTES = new byte[0]; 057 058 private byte[] bytes = EMPTY_BYTES; 059 private int length; 060 061 public UTF8() { 062 //set(""); 063 } 064 065 /** Construct from a given string. */ 066 public UTF8(String string) { 067 set(string); 068 } 069 070 /** Construct from a given string. */ 071 public UTF8(UTF8 utf8) { 072 set(utf8); 073 } 074 075 /** The raw bytes. */ 076 public byte[] getBytes() { 077 return bytes; 078 } 079 080 /** The number of bytes in the encoded string. */ 081 public int getLength() { 082 return length; 083 } 084 085 /** Set to contain the contents of a string. */ 086 public void set(String string) { 087 if (string.length() > 0xffff/3) { // maybe too long 088 LOG.warn("truncating long string: " + string.length() 089 + " chars, starting with " + string.substring(0, 20)); 090 string = string.substring(0, 0xffff/3); 091 } 092 093 length = utf8Length(string); // compute length 094 if (length > 0xffff) // double-check length 095 throw new RuntimeException("string too long!"); 096 097 if (bytes == null || length > bytes.length) // grow buffer 098 bytes = new byte[length]; 099 100 try { // avoid sync'd allocations 101 DataOutputBuffer obuf = OBUF_FACTORY.get(); 102 obuf.reset(); 103 writeChars(obuf, string, 0, string.length()); 104 System.arraycopy(obuf.getData(), 0, bytes, 0, length); 105 } catch (IOException e) { 106 throw new RuntimeException(e); 107 } 108 } 109 110 /** Set to contain the contents of a string. */ 111 public void set(UTF8 other) { 112 length = other.length; 113 if (bytes == null || length > bytes.length) // grow buffer 114 bytes = new byte[length]; 115 System.arraycopy(other.bytes, 0, bytes, 0, length); 116 } 117 118 @Override 119 public void readFields(DataInput in) throws IOException { 120 length = in.readUnsignedShort(); 121 if (bytes == null || bytes.length < length) 122 bytes = new byte[length]; 123 in.readFully(bytes, 0, length); 124 } 125 126 /** Skips over one UTF8 in the input. */ 127 public static void skip(DataInput in) throws IOException { 128 int length = in.readUnsignedShort(); 129 WritableUtils.skipFully(in, length); 130 } 131 132 @Override 133 public void write(DataOutput out) throws IOException { 134 out.writeShort(length); 135 out.write(bytes, 0, length); 136 } 137 138 /** Compare two UTF8s. */ 139 @Override 140 public int compareTo(UTF8 o) { 141 return WritableComparator.compareBytes(bytes, 0, length, 142 o.bytes, 0, o.length); 143 } 144 145 /** Convert to a String. */ 146 @Override 147 public String toString() { 148 StringBuilder buffer = new StringBuilder(length); 149 try { 150 synchronized (IBUF) { 151 IBUF.reset(bytes, length); 152 readChars(IBUF, buffer, length); 153 } 154 } catch (IOException e) { 155 throw new RuntimeException(e); 156 } 157 return buffer.toString(); 158 } 159 160 /** 161 * Convert to a string, checking for valid UTF8. 162 * @return the converted string 163 * @throws UTFDataFormatException if the underlying bytes contain invalid 164 * UTF8 data. 165 */ 166 public String toStringChecked() throws IOException { 167 StringBuilder buffer = new StringBuilder(length); 168 synchronized (IBUF) { 169 IBUF.reset(bytes, length); 170 readChars(IBUF, buffer, length); 171 } 172 return buffer.toString(); 173 } 174 175 /** Returns true iff <code>o</code> is a UTF8 with the same contents. */ 176 @Override 177 public boolean equals(Object o) { 178 if (!(o instanceof UTF8)) 179 return false; 180 UTF8 that = (UTF8)o; 181 if (this.length != that.length) 182 return false; 183 else 184 return WritableComparator.compareBytes(bytes, 0, length, 185 that.bytes, 0, that.length) == 0; 186 } 187 188 @Override 189 public int hashCode() { 190 return WritableComparator.hashBytes(bytes, length); 191 } 192 193 /** A WritableComparator optimized for UTF8 keys. */ 194 public static class Comparator extends WritableComparator { 195 public Comparator() { 196 super(UTF8.class); 197 } 198 199 @Override 200 public int compare(byte[] b1, int s1, int l1, 201 byte[] b2, int s2, int l2) { 202 int n1 = readUnsignedShort(b1, s1); 203 int n2 = readUnsignedShort(b2, s2); 204 return compareBytes(b1, s1+2, n1, b2, s2+2, n2); 205 } 206 } 207 208 static { // register this comparator 209 WritableComparator.define(UTF8.class, new Comparator()); 210 } 211 212 /// STATIC UTILITIES FROM HERE DOWN 213 214 /// These are probably not used much anymore, and might be removed... 215 216 /** Convert a string to a UTF-8 encoded byte array. 217 * @see String#getBytes(String) 218 */ 219 public static byte[] getBytes(String string) { 220 byte[] result = new byte[utf8Length(string)]; 221 try { // avoid sync'd allocations 222 DataOutputBuffer obuf = OBUF_FACTORY.get(); 223 obuf.reset(); 224 writeChars(obuf, string, 0, string.length()); 225 System.arraycopy(obuf.getData(), 0, result, 0, obuf.getLength()); 226 } catch (IOException e) { 227 throw new RuntimeException(e); 228 } 229 return result; 230 } 231 232 /** 233 * Convert a UTF-8 encoded byte array back into a string. 234 * 235 * @throws IOException if the byte array is invalid UTF8 236 */ 237 public static String fromBytes(byte[] bytes) throws IOException { 238 DataInputBuffer dbuf = new DataInputBuffer(); 239 dbuf.reset(bytes, 0, bytes.length); 240 StringBuilder buf = new StringBuilder(bytes.length); 241 readChars(dbuf, buf, bytes.length); 242 return buf.toString(); 243 } 244 245 /** Read a UTF-8 encoded string. 246 * 247 * @see DataInput#readUTF() 248 */ 249 public static String readString(DataInput in) throws IOException { 250 int bytes = in.readUnsignedShort(); 251 StringBuilder buffer = new StringBuilder(bytes); 252 readChars(in, buffer, bytes); 253 return buffer.toString(); 254 } 255 256 private static void readChars(DataInput in, StringBuilder buffer, int nBytes) 257 throws UTFDataFormatException, IOException { 258 DataOutputBuffer obuf = OBUF_FACTORY.get(); 259 obuf.reset(); 260 obuf.write(in, nBytes); 261 byte[] bytes = obuf.getData(); 262 int i = 0; 263 while (i < nBytes) { 264 byte b = bytes[i++]; 265 if ((b & 0x80) == 0) { 266 // 0b0xxxxxxx: 1-byte sequence 267 buffer.append((char)(b & 0x7F)); 268 } else if ((b & 0xE0) == 0xC0) { 269 if (i >= nBytes) { 270 throw new UTFDataFormatException("Truncated UTF8 at " + 271 StringUtils.byteToHexString(bytes, i - 1, 1)); 272 } 273 // 0b110xxxxx: 2-byte sequence 274 buffer.append((char)(((b & 0x1F) << 6) 275 | (bytes[i++] & 0x3F))); 276 } else if ((b & 0xF0) == 0xE0) { 277 // 0b1110xxxx: 3-byte sequence 278 if (i + 1 >= nBytes) { 279 throw new UTFDataFormatException("Truncated UTF8 at " + 280 StringUtils.byteToHexString(bytes, i - 1, 2)); 281 } 282 buffer.append((char)(((b & 0x0F) << 12) 283 | ((bytes[i++] & 0x3F) << 6) 284 | (bytes[i++] & 0x3F))); 285 } else if ((b & 0xF8) == 0xF0) { 286 if (i + 2 >= nBytes) { 287 throw new UTFDataFormatException("Truncated UTF8 at " + 288 StringUtils.byteToHexString(bytes, i - 1, 3)); 289 } 290 // 0b11110xxx: 4-byte sequence 291 int codepoint = 292 ((b & 0x07) << 18) 293 | ((bytes[i++] & 0x3F) << 12) 294 | ((bytes[i++] & 0x3F) << 6) 295 | ((bytes[i++] & 0x3F)); 296 buffer.append(highSurrogate(codepoint)) 297 .append(lowSurrogate(codepoint)); 298 } else { 299 // The UTF8 standard describes 5-byte and 6-byte sequences, but 300 // these are no longer allowed as of 2003 (see RFC 3629) 301 302 // Only show the next 6 bytes max in the error code - in case the 303 // buffer is large, this will prevent an exceedingly large message. 304 int endForError = Math.min(i + 5, nBytes); 305 throw new UTFDataFormatException("Invalid UTF8 at " + 306 StringUtils.byteToHexString(bytes, i - 1, endForError)); 307 } 308 } 309 } 310 311 private static char highSurrogate(int codePoint) { 312 return (char) ((codePoint >>> 10) 313 + (Character.MIN_HIGH_SURROGATE - (Character.MIN_SUPPLEMENTARY_CODE_POINT >>> 10))); 314 } 315 316 private static char lowSurrogate(int codePoint) { 317 return (char) ((codePoint & 0x3ff) + Character.MIN_LOW_SURROGATE); 318 } 319 320 /** Write a UTF-8 encoded string. 321 * 322 * @see DataOutput#writeUTF(String) 323 */ 324 public static int writeString(DataOutput out, String s) throws IOException { 325 if (s.length() > 0xffff/3) { // maybe too long 326 LOG.warn("truncating long string: " + s.length() 327 + " chars, starting with " + s.substring(0, 20)); 328 s = s.substring(0, 0xffff/3); 329 } 330 331 int len = utf8Length(s); 332 if (len > 0xffff) // double-check length 333 throw new IOException("string too long!"); 334 335 out.writeShort(len); 336 writeChars(out, s, 0, s.length()); 337 return len; 338 } 339 340 /** Returns the number of bytes required to write this. */ 341 private static int utf8Length(String string) { 342 int stringLength = string.length(); 343 int utf8Length = 0; 344 for (int i = 0; i < stringLength; i++) { 345 int c = string.charAt(i); 346 if (c <= 0x007F) { 347 utf8Length++; 348 } else if (c > 0x07FF) { 349 utf8Length += 3; 350 } else { 351 utf8Length += 2; 352 } 353 } 354 return utf8Length; 355 } 356 357 private static void writeChars(DataOutput out, 358 String s, int start, int length) 359 throws IOException { 360 final int end = start + length; 361 for (int i = start; i < end; i++) { 362 int code = s.charAt(i); 363 if (code <= 0x7F) { 364 out.writeByte((byte)code); 365 } else if (code <= 0x07FF) { 366 out.writeByte((byte)(0xC0 | ((code >> 6) & 0x1F))); 367 out.writeByte((byte)(0x80 | code & 0x3F)); 368 } else { 369 out.writeByte((byte)(0xE0 | ((code >> 12) & 0X0F))); 370 out.writeByte((byte)(0x80 | ((code >> 6) & 0x3F))); 371 out.writeByte((byte)(0x80 | (code & 0x3F))); 372 } 373 } 374 } 375 376}