/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.hadoop.record;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.io.WritableUtils;

/**
 * Various utility functions for Hadoop record I/O runtime.
 *
 * @deprecated Replaced by <a href="https://hadoop.apache.org/avro/">Avro</a>.
 */
@Deprecated
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Utils {

  /** Cannot create a new instance of Utils */
  private Utils() {
  }

  /** Upper-case hexadecimal digits, indexed by nibble value (0-15). */
  public static final char[] hexchars = { '0', '1', '2', '3', '4', '5',
                                          '6', '7', '8', '9', 'A', 'B',
                                          'C', 'D', 'E', 'F' };

  /**
   * Escapes a string for use as XML character data.
   * <p>
   * {@code <} becomes {@code &lt;} and {@code &} becomes {@code &amp;}.
   * {@code %} becomes {@code %0025}. Chars below 0x20, surrogate chars
   * (0xD800-0xDFFF) and chars above 0xFFFD are written as {@code %} followed
   * by four upper-case hexadecimal digits of the char value. Everything else
   * is copied unchanged.
   *
   * @param s string to escape
   * @return the escaped string
   */
  static String toXMLString(String s) {
    StringBuilder sb = new StringBuilder();
    for (int idx = 0; idx < s.length(); idx++) {
      char ch = s.charAt(idx);
      if (ch == '<') {
        // Must be an entity reference; a raw '<' would start markup.
        sb.append("&lt;");
      } else if (ch == '&') {
        // Must be an entity reference; a raw '&' would start a reference.
        sb.append("&amp;");
      } else if (ch == '%') {
        // '%' is the escape introducer, so it is itself escaped.
        sb.append("%0025");
      } else if (ch < 0x20 ||
                 (ch > 0xD7FF && ch < 0xE000) ||
                 (ch > 0xFFFD)) {
        sb.append("%");
        sb.append(hexchars[(ch & 0xF000) >> 12]);
        sb.append(hexchars[(ch & 0x0F00) >> 8]);
        sb.append(hexchars[(ch & 0x00F0) >> 4]);
        sb.append(hexchars[(ch & 0x000F)]);
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }

  /**
   * Converts one hexadecimal digit (either case) to its value.
   *
   * @param ch character to convert
   * @return the digit value 0-15, or 0 if {@code ch} is not a hex digit
   */
  static private int h2c(char ch) {
    if (ch >= '0' && ch <= '9') {
      return ch - '0';
    } else if (ch >= 'A' && ch <= 'F') {
      return ch - 'A' + 10;
    } else if (ch >= 'a' && ch <= 'f') {
      return ch - 'a' + 10;
    }
    return 0;
  }

  /**
   * Reverses the {@code %XXXX} escapes produced by
   * {@link #toXMLString(String)}. Entity references are not decoded here.
   * <p>
   * Each {@code %} consumes the next four chars as hexadecimal digits; a
   * char that is not a hex digit contributes 0. If fewer than four chars
   * follow a {@code %}, {@link String#charAt(int)} throws
   * {@link StringIndexOutOfBoundsException}.
   *
   * @param s string containing {@code %XXXX} escapes
   * @return the unescaped string
   */
  static String fromXMLString(String s) {
    StringBuilder sb = new StringBuilder();
    for (int idx = 0; idx < s.length();) {
      char ch = s.charAt(idx++);
      if (ch == '%') {
        int ch1 = h2c(s.charAt(idx++)) << 12;
        int ch2 = h2c(s.charAt(idx++)) << 8;
        int ch3 = h2c(s.charAt(idx++)) << 4;
        int ch4 = h2c(s.charAt(idx++));
        char res = (char)(ch1 | ch2 | ch3 | ch4);
        sb.append(res);
      } else {
        sb.append(ch);
      }
    }
    return sb.toString();
  }

  /**
   * Serializes a string for CSV output: a leading single quote followed by
   * the string with NUL, LF, CR, {@code ,}, {@code }} and {@code %} replaced
   * by {@code %00}, {@code %0A}, {@code %0D}, {@code %2C}, {@code %7D} and
   * {@code %25} respectively.
   *
   * @param s string to serialize
   * @return the CSV representation of {@code s}
   */
  static String toCSVString(String s) {
    StringBuilder sb = new StringBuilder(s.length()+1);
    sb.append('\'');
    int len = s.length();
    for (int i = 0; i < len; i++) {
      char c = s.charAt(i);
      switch(c) {
      case '\0':
        sb.append("%00");
        break;
      case '\n':
        sb.append("%0A");
        break;
      case '\r':
        sb.append("%0D");
        break;
      case ',':
        sb.append("%2C");
        break;
      case '}':
        sb.append("%7D");
        break;
      case '%':
        sb.append("%25");
        break;
      default:
        sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Deserializes a string written by {@link #toCSVString(String)}.
   *
   * @param s CSV representation; must start with a single quote
   * @throws java.io.IOException if {@code s} is empty, does not start with a
   *         single quote, contains a {@code %} that is not followed by two
   *         characters, or contains an escape other than the six produced by
   *         {@link #toCSVString(String)}
   * @return the deserialized string
   */
  static String fromCSVString(String s) throws IOException {
    // Empty input has no leading quote; report it as a format error instead
    // of letting charAt(0) throw an unchecked exception.
    if (s.length() == 0 || s.charAt(0) != '\'') {
      throw new IOException("Error deserializing string.");
    }
    int len = s.length();
    StringBuilder sb = new StringBuilder(len-1);
    for (int i = 1; i < len; i++) {
      char c = s.charAt(i);
      if (c == '%') {
        // An escape needs two more characters after the '%'.
        if (i + 2 >= len) {
          throw new IOException("Error deserializing string.");
        }
        char ch1 = s.charAt(i+1);
        char ch2 = s.charAt(i+2);
        i += 2;
        if (ch1 == '0' && ch2 == '0') {
          sb.append('\0');
        } else if (ch1 == '0' && ch2 == 'A') {
          sb.append('\n');
        } else if (ch1 == '0' && ch2 == 'D') {
          sb.append('\r');
        } else if (ch1 == '2' && ch2 == 'C') {
          sb.append(',');
        } else if (ch1 == '7' && ch2 == 'D') {
          sb.append('}');
        } else if (ch1 == '2' && ch2 == '5') {
          sb.append('%');
        } else {
          throw new IOException("Error deserializing string.");
        }
      } else {
        sb.append(c);
      }
    }
    return sb.toString();
  }

  /**
   * Returns the XML text form of a buffer, which is {@code s.toString()}.
   *
   * @param s buffer to serialize
   * @return {@code s.toString()}
   */
  static String toXMLBuffer(Buffer s) {
    return s.toString();
  }

  /**
   * Decodes consecutive pairs of hexadecimal digits of {@code s}, beginning
   * at index {@code start}, into bytes. A trailing unpaired character is
   * ignored because the byte count is computed with integer division.
   *
   * @param s text containing the hexadecimal digits
   * @param start index of the first hexadecimal digit
   * @throws java.io.IOException if a pair cannot be parsed as base-16
   * @return the decoded bytes
   */
  private static byte[] hexToBytes(String s, int start) throws IOException {
    int blen = (s.length() - start) / 2;
    byte[] barr = new byte[blen];
    for (int idx = 0; idx < blen; idx++) {
      char c1 = s.charAt(start + 2*idx);
      char c2 = s.charAt(start + 2*idx + 1);
      try {
        barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
      } catch (NumberFormatException e) {
        // Surface malformed input through the declared checked exception.
        throw new IOException("Error deserializing buffer.", e);
      }
    }
    return barr;
  }

  /**
   * Converts the XML text form of a buffer (pairs of hexadecimal digits)
   * to a new Buffer.
   *
   * @param s XML text form of the buffer
   * @throws java.io.IOException if {@code s} contains a pair that is not
   *         valid hexadecimal
   * @return the deserialized Buffer; an empty Buffer if {@code s} is empty
   */
  static Buffer fromXMLBuffer(String s)
    throws IOException {
    if (s.length() == 0) { return new Buffer(); }
    return new Buffer(hexToBytes(s, 0));
  }

  /**
   * Returns the CSV form of a buffer: {@code #} followed by
   * {@code buf.toString()}.
   *
   * @param buf buffer to serialize
   * @return the CSV representation of {@code buf}
   */
  static String toCSVBuffer(Buffer buf) {
    StringBuilder sb = new StringBuilder("#");
    sb.append(buf.toString());
    return sb.toString();
  }

  /**
   * Converts a CSV-serialized representation of buffer to a new
   * Buffer
   * @param s CSV-serialized representation of buffer
   * @throws java.io.IOException if {@code s} is empty, does not start with
   *         {@code #}, or contains a pair that is not valid hexadecimal
   * @return Deserialized Buffer
   */
  static Buffer fromCSVBuffer(String s)
    throws IOException {
    // Empty input has no '#' marker; report it as a format error instead of
    // letting charAt(0) throw an unchecked exception.
    if (s.length() == 0 || s.charAt(0) != '#') {
      throw new IOException("Error deserializing buffer.");
    }
    if (s.length() == 1) { return new Buffer(); }
    return new Buffer(hexToBytes(s, 1));
  }

  /**
   * Returns the number of UTF-8 bytes needed for a code point, using the
   * same ranges as {@link #writeUtf8(int, byte[], int)}. Not referenced by
   * the other methods of this class.
   *
   * @param cpt code point
   * @throws java.io.IOException if {@code cpt} is outside the accepted ranges
   * @return 1, 2, 3 or 4
   */
  private static int utf8LenForCodePoint(final int cpt) throws IOException {
    if (cpt >=0 && cpt <= 0x7F) {
      return 1;
    }
    if (cpt >= 0x80 && cpt <= 0x07FF) {
      return 2;
    }
    if ((cpt >= 0x0800 && cpt < 0xD800) ||
        (cpt > 0xDFFF && cpt <= 0xFFFD)) {
      return 3;
    }
    if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
      return 4;
    }
    throw new IOException("Illegal Unicode Codepoint "+
                          Integer.toHexString(cpt)+" in string.");
  }

  // Bit patterns of UTF-8 bytes. B10/B110/B1110/B11110 are the marker bits
  // of continuation and 2-/3-/4-byte lead bytes; B11/B111/B1111/B11111 are
  // the masks that isolate those marker bits.
  private static final int B10 =    0x80; // 10000000
  private static final int B110 =   0xC0; // 11000000
  private static final int B1110 =  0xE0; // 11100000
  private static final int B11110 = 0xF0; // 11110000
  private static final int B11 =    0xC0; // 11000000
  private static final int B111 =   0xE0; // 11100000
  private static final int B1111 =  0xF0; // 11110000
  private static final int B11111 = 0xF8; // 11111000

  /**
   * Writes the UTF-8 encoding of a code point into {@code bytes}.
   *
   * @param cpt code point to encode
   * @param bytes destination array
   * @param offset index in {@code bytes} of the first byte to write
   * @throws java.io.IOException if {@code cpt} is negative, in
   *         0xD800-0xDFFF, 0xFFFE, 0xFFFF, or above 0x10FFFF
   * @return the number of bytes written (1 to 4)
   */
  private static int writeUtf8(int cpt, final byte[] bytes, final int offset)
    throws IOException {
    if (cpt >=0 && cpt <= 0x7F) {
      bytes[offset] = (byte) cpt;
      return 1;
    }
    // Multi-byte forms are filled from the last byte backwards, shifting
    // six payload bits out of cpt at each step.
    if (cpt >= 0x80 && cpt <= 0x07FF) {
      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset] = (byte) (B110 | (cpt & 0x1F));
      return 2;
    }
    if ((cpt >= 0x0800 && cpt < 0xD800) ||
        (cpt > 0xDFFF && cpt <= 0xFFFD)) {
      bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset] = (byte) (B1110 | (cpt & 0x0F));
      return 3;
    }
    if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
      bytes[offset+3] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
      cpt = cpt >> 6;
      bytes[offset] = (byte) (B11110 | (cpt & 0x07));
      return 4;
    }
    throw new IOException("Illegal Unicode Codepoint "+
                          Integer.toHexString(cpt)+" in string.");
  }

  /**
   * Writes a string as a variable-length int holding the UTF-8 byte count,
   * followed by the UTF-8 bytes.
   *
   * @param out stream to write to
   * @param str string to serialize
   * @throws java.io.IOException if the string contains a code point rejected
   *         by {@link #writeUtf8(int, byte[], int)}, or if writing fails
   */
  static void toBinaryString(final DataOutput out, final String str)
    throws IOException {
    final int strlen = str.length();
    byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max
    int utf8Len = 0;
    int idx = 0;
    while(idx < strlen) {
      final int cpt = str.codePointAt(idx);
      // A supplementary code point occupies two chars in the String.
      idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1;
      utf8Len += writeUtf8(cpt, bytes, utf8Len);
    }
    writeVInt(out, utf8Len);
    out.write(bytes, 0, utf8Len);
  }

  /**
   * Tells whether a decoded code point is accepted by
   * {@link #fromBinaryString(DataInput)}.
   *
   * @param cpt code point to test
   * @return false if {@code cpt} is above 0x10FFFF, in 0xD800-0xDFFF,
   *         or is 0xFFFE or 0xFFFF; true otherwise
   */
  static boolean isValidCodePoint(int cpt) {
    return !((cpt > 0x10FFFF) ||
             (cpt >= 0xD800 && cpt <= 0xDFFF) ||
             (cpt >= 0xFFFE && cpt <=0xFFFF));
  }

  /** Combines the payload bits of a 4-byte UTF-8 sequence. */
  private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) {
    int cpt = 0;
    cpt = (((b1 & ~B11111) << 18) |
           ((b2 & ~B11) << 12) |
           ((b3 & ~B11) << 6) |
           (b4 & ~B11));
    return cpt;
  }

  /** Combines the payload bits of a 3-byte UTF-8 sequence. */
  private static int utf8ToCodePoint(int b1, int b2, int b3) {
    int cpt = 0;
    cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11));
    return cpt;
  }

  /** Combines the payload bits of a 2-byte UTF-8 sequence. */
  private static int utf8ToCodePoint(int b1, int b2) {
    int cpt = 0;
    cpt = (((b1 & ~B111) << 6) | (b2 & ~B11));
    return cpt;
  }

  /**
   * Verifies that a byte has the {@code 10xxxxxx} continuation pattern.
   *
   * @param b byte value (0-255) to check
   * @throws java.io.IOException if the top two bits are not {@code 10}
   */
  private static void checkB10(int b) throws IOException {
    if ((b & B11) != B10) {
      throw new IOException("Invalid UTF-8 representation.");
    }
  }

  /**
   * Fetches the continuation byte at {@code pos} as an unsigned value.
   *
   * @param bytes UTF-8 data
   * @param pos index of the expected continuation byte
   * @throws java.io.IOException if {@code pos} is past the end of the data
   *         or the byte is not a continuation byte
   * @return the byte value (0-255)
   */
  private static int readContinuation(final byte[] bytes, final int pos)
    throws IOException {
    // A sequence cut off at the end of the data is malformed input, not an
    // indexing error.
    if (pos >= bytes.length) {
      throw new IOException("Invalid UTF-8 representation.");
    }
    final int b = bytes[pos] & 0xFF;
    checkB10(b);
    return b;
  }

  /**
   * Reads a string written by {@link #toBinaryString(DataOutput, String)}:
   * a variable-length int byte count followed by that many UTF-8 bytes.
   *
   * @param din stream to read from
   * @throws java.io.IOException if the byte count is negative, the bytes
   *         are not well-formed UTF-8 (bad lead byte, bad or missing
   *         continuation byte), a decoded code point fails
   *         {@link #isValidCodePoint(int)}, or reading fails
   * @return the deserialized string
   */
  static String fromBinaryString(final DataInput din) throws IOException {
    final int utf8Len = readVInt(din);
    if (utf8Len < 0) {
      throw new IOException("Invalid UTF-8 length "+utf8Len+" in stream.");
    }
    final byte[] bytes = new byte[utf8Len];
    din.readFully(bytes);
    int len = 0;
    // For the most common case, i.e. ascii, numChars = utf8Len
    StringBuilder sb = new StringBuilder(utf8Len);
    while(len < utf8Len) {
      int cpt = 0;
      final int b1 = bytes[len++] & 0xFF;
      if (b1 <= 0x7F) {
        cpt = b1;
      } else if ((b1 & B11111) == B11110) {
        int b2 = readContinuation(bytes, len++);
        int b3 = readContinuation(bytes, len++);
        int b4 = readContinuation(bytes, len++);
        cpt = utf8ToCodePoint(b1, b2, b3, b4);
      } else if ((b1 & B1111) == B1110) {
        int b2 = readContinuation(bytes, len++);
        int b3 = readContinuation(bytes, len++);
        cpt = utf8ToCodePoint(b1, b2, b3);
      } else if ((b1 & B111) == B110) {
        int b2 = readContinuation(bytes, len++);
        cpt = utf8ToCodePoint(b1, b2);
      } else {
        throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+
                              " at offset "+(len-1)+" in length of "+utf8Len);
      }
      if (!isValidCodePoint(cpt)) {
        throw new IOException("Illegal Unicode Codepoint "+
                              Integer.toHexString(cpt)+" in stream.");
      }
      sb.appendCodePoint(cpt);
    }
    return sb.toString();
  }

  /** Parse a float from a byte array. */
  public static float readFloat(byte[] bytes, int start) {
    return WritableComparator.readFloat(bytes, start);
  }

  /** Parse a double from a byte array. */
  public static double readDouble(byte[] bytes, int start) {
    return WritableComparator.readDouble(bytes, start);
  }

  /**
   * Reads a zero-compressed encoded long from a byte array and returns it.
   * @param bytes byte array with the encoded long
   * @param start starting index
   * @throws java.io.IOException
   * @return deserialized long
   */
  public static long readVLong(byte[] bytes, int start) throws IOException {
    return WritableComparator.readVLong(bytes, start);
  }

  /**
   * Reads a zero-compressed encoded integer from a byte array and returns it.
   * @param bytes byte array with the encoded integer
   * @param start start index
   * @throws java.io.IOException
   * @return deserialized integer
   */
  public static int readVInt(byte[] bytes, int start) throws IOException {
    return WritableComparator.readVInt(bytes, start);
  }

  /**
   * Reads a zero-compressed encoded long from a stream and returns it.
   * @param in input stream
   * @throws java.io.IOException
   * @return deserialized long
   */
  public static long readVLong(DataInput in) throws IOException {
    return WritableUtils.readVLong(in);
  }

  /**
   * Reads a zero-compressed encoded integer from a stream and returns it.
   * @param in input stream
   * @throws java.io.IOException
   * @return deserialized integer
   */
  public static int readVInt(DataInput in) throws IOException {
    return WritableUtils.readVInt(in);
  }

  /**
   * Get the encoded length if an integer is stored in a variable-length format
   * @param i the value to be encoded
   * @return the encoded length
   */
  public static int getVIntSize(long i) {
    return WritableUtils.getVIntSize(i);
  }

  /**
   * Serializes a long to a binary stream with zero-compressed encoding.
   * For {@code -112 <= i <= 127}, only one byte is used with the actual value.
   * For other values of i, the first byte value indicates whether the
   * long is positive or negative, and the number of bytes that follow.
   * If the first byte value v is between -113 and -120, the following long
   * is positive, with number of bytes that follow are -(v+112).
   * If the first byte value v is between -121 and -128, the following long
   * is negative, with number of bytes that follow are -(v+120). Bytes are
   * stored in the high-non-zero-byte-first order.
   *
   * @param stream Binary output stream
   * @param i Long to be serialized
   * @throws java.io.IOException
   */
  public static void writeVLong(DataOutput stream, long i) throws IOException {
    WritableUtils.writeVLong(stream, i);
  }

  /**
   * Serializes an int to a binary stream with zero-compressed encoding.
   *
   * @param stream Binary output stream
   * @param i int to be serialized
   * @throws java.io.IOException
   */
  public static void writeVInt(DataOutput stream, int i) throws IOException {
    WritableUtils.writeVInt(stream, i);
  }

  /** Lexicographic order of binary data. */
  public static int compareBytes(byte[] b1, int s1, int l1,
                                 byte[] b2, int s2, int l2) {
    return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
  }
}