001 /**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements. See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership. The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License. You may obtain a copy of the License at
009 *
010 * http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019 package org.apache.hadoop.record;
020
021 import java.io.DataInput;
022 import java.io.DataOutput;
023 import java.io.IOException;
024
025 import org.apache.hadoop.classification.InterfaceAudience;
026 import org.apache.hadoop.classification.InterfaceStability;
027 import org.apache.hadoop.io.WritableComparator;
028 import org.apache.hadoop.io.WritableUtils;
029
030 /**
031 * Various utility functions for Hadoop record I/O runtime.
032 *
033 * @deprecated Replaced by <a href="https://hadoop.apache.org/avro/">Avro</a>.
034 */
035 @Deprecated
036 @InterfaceAudience.Public
037 @InterfaceStability.Stable
038 public class Utils {
039
/** Static-only helper class; the private constructor prevents instantiation. */
private Utils() {
  // nothing to initialize
}
043
044 public static final char[] hexchars = { '0', '1', '2', '3', '4', '5',
045 '6', '7', '8', '9', 'A', 'B',
046 'C', 'D', 'E', 'F' };
047 /**
048 *
049 * @param s
050 * @return
051 */
052 static String toXMLString(String s) {
053 StringBuilder sb = new StringBuilder();
054 for (int idx = 0; idx < s.length(); idx++) {
055 char ch = s.charAt(idx);
056 if (ch == '<') {
057 sb.append("<");
058 } else if (ch == '&') {
059 sb.append("&");
060 } else if (ch == '%') {
061 sb.append("%0025");
062 } else if (ch < 0x20 ||
063 (ch > 0xD7FF && ch < 0xE000) ||
064 (ch > 0xFFFD)) {
065 sb.append("%");
066 sb.append(hexchars[(ch & 0xF000) >> 12]);
067 sb.append(hexchars[(ch & 0x0F00) >> 8]);
068 sb.append(hexchars[(ch & 0x00F0) >> 4]);
069 sb.append(hexchars[(ch & 0x000F)]);
070 } else {
071 sb.append(ch);
072 }
073 }
074 return sb.toString();
075 }
076
077 static private int h2c(char ch) {
078 if (ch >= '0' && ch <= '9') {
079 return ch - '0';
080 } else if (ch >= 'A' && ch <= 'F') {
081 return ch - 'A' + 10;
082 } else if (ch >= 'a' && ch <= 'f') {
083 return ch - 'a' + 10;
084 }
085 return 0;
086 }
087
088 /**
089 *
090 * @param s
091 * @return
092 */
093 static String fromXMLString(String s) {
094 StringBuilder sb = new StringBuilder();
095 for (int idx = 0; idx < s.length();) {
096 char ch = s.charAt(idx++);
097 if (ch == '%') {
098 int ch1 = h2c(s.charAt(idx++)) << 12;
099 int ch2 = h2c(s.charAt(idx++)) << 8;
100 int ch3 = h2c(s.charAt(idx++)) << 4;
101 int ch4 = h2c(s.charAt(idx++));
102 char res = (char)(ch1 | ch2 | ch3 | ch4);
103 sb.append(res);
104 } else {
105 sb.append(ch);
106 }
107 }
108 return sb.toString();
109 }
110
111 /**
112 *
113 * @param s
114 * @return
115 */
116 static String toCSVString(String s) {
117 StringBuilder sb = new StringBuilder(s.length()+1);
118 sb.append('\'');
119 int len = s.length();
120 for (int i = 0; i < len; i++) {
121 char c = s.charAt(i);
122 switch(c) {
123 case '\0':
124 sb.append("%00");
125 break;
126 case '\n':
127 sb.append("%0A");
128 break;
129 case '\r':
130 sb.append("%0D");
131 break;
132 case ',':
133 sb.append("%2C");
134 break;
135 case '}':
136 sb.append("%7D");
137 break;
138 case '%':
139 sb.append("%25");
140 break;
141 default:
142 sb.append(c);
143 }
144 }
145 return sb.toString();
146 }
147
148 /**
149 *
150 * @param s
151 * @throws java.io.IOException
152 * @return
153 */
154 static String fromCSVString(String s) throws IOException {
155 if (s.charAt(0) != '\'') {
156 throw new IOException("Error deserializing string.");
157 }
158 int len = s.length();
159 StringBuilder sb = new StringBuilder(len-1);
160 for (int i = 1; i < len; i++) {
161 char c = s.charAt(i);
162 if (c == '%') {
163 char ch1 = s.charAt(i+1);
164 char ch2 = s.charAt(i+2);
165 i += 2;
166 if (ch1 == '0' && ch2 == '0') {
167 sb.append('\0');
168 } else if (ch1 == '0' && ch2 == 'A') {
169 sb.append('\n');
170 } else if (ch1 == '0' && ch2 == 'D') {
171 sb.append('\r');
172 } else if (ch1 == '2' && ch2 == 'C') {
173 sb.append(',');
174 } else if (ch1 == '7' && ch2 == 'D') {
175 sb.append('}');
176 } else if (ch1 == '2' && ch2 == '5') {
177 sb.append('%');
178 } else {
179 throw new IOException("Error deserializing string.");
180 }
181 } else {
182 sb.append(c);
183 }
184 }
185 return sb.toString();
186 }
187
188 /**
189 *
190 * @param s
191 * @return
192 */
193 static String toXMLBuffer(Buffer s) {
194 return s.toString();
195 }
196
197 /**
198 *
199 * @param s
200 * @throws java.io.IOException
201 * @return
202 */
203 static Buffer fromXMLBuffer(String s)
204 throws IOException {
205 if (s.length() == 0) { return new Buffer(); }
206 int blen = s.length()/2;
207 byte[] barr = new byte[blen];
208 for (int idx = 0; idx < blen; idx++) {
209 char c1 = s.charAt(2*idx);
210 char c2 = s.charAt(2*idx+1);
211 barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
212 }
213 return new Buffer(barr);
214 }
215
216 /**
217 *
218 * @param buf
219 * @return
220 */
221 static String toCSVBuffer(Buffer buf) {
222 StringBuilder sb = new StringBuilder("#");
223 sb.append(buf.toString());
224 return sb.toString();
225 }
226
227 /**
228 * Converts a CSV-serialized representation of buffer to a new
229 * Buffer
230 * @param s CSV-serialized representation of buffer
231 * @throws java.io.IOException
232 * @return Deserialized Buffer
233 */
234 static Buffer fromCSVBuffer(String s)
235 throws IOException {
236 if (s.charAt(0) != '#') {
237 throw new IOException("Error deserializing buffer.");
238 }
239 if (s.length() == 1) { return new Buffer(); }
240 int blen = (s.length()-1)/2;
241 byte[] barr = new byte[blen];
242 for (int idx = 0; idx < blen; idx++) {
243 char c1 = s.charAt(2*idx+1);
244 char c2 = s.charAt(2*idx+2);
245 barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
246 }
247 return new Buffer(barr);
248 }
249
250 private static int utf8LenForCodePoint(final int cpt) throws IOException {
251 if (cpt >=0 && cpt <= 0x7F) {
252 return 1;
253 }
254 if (cpt >= 0x80 && cpt <= 0x07FF) {
255 return 2;
256 }
257 if ((cpt >= 0x0800 && cpt < 0xD800) ||
258 (cpt > 0xDFFF && cpt <= 0xFFFD)) {
259 return 3;
260 }
261 if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
262 return 4;
263 }
264 throw new IOException("Illegal Unicode Codepoint "+
265 Integer.toHexString(cpt)+" in string.");
266 }
267
268 private static final int B10 = Integer.parseInt("10000000", 2);
269 private static final int B110 = Integer.parseInt("11000000", 2);
270 private static final int B1110 = Integer.parseInt("11100000", 2);
271 private static final int B11110 = Integer.parseInt("11110000", 2);
272 private static final int B11 = Integer.parseInt("11000000", 2);
273 private static final int B111 = Integer.parseInt("11100000", 2);
274 private static final int B1111 = Integer.parseInt("11110000", 2);
275 private static final int B11111 = Integer.parseInt("11111000", 2);
276
277 private static int writeUtf8(int cpt, final byte[] bytes, final int offset)
278 throws IOException {
279 if (cpt >=0 && cpt <= 0x7F) {
280 bytes[offset] = (byte) cpt;
281 return 1;
282 }
283 if (cpt >= 0x80 && cpt <= 0x07FF) {
284 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
285 cpt = cpt >> 6;
286 bytes[offset] = (byte) (B110 | (cpt & 0x1F));
287 return 2;
288 }
289 if ((cpt >= 0x0800 && cpt < 0xD800) ||
290 (cpt > 0xDFFF && cpt <= 0xFFFD)) {
291 bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
292 cpt = cpt >> 6;
293 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
294 cpt = cpt >> 6;
295 bytes[offset] = (byte) (B1110 | (cpt & 0x0F));
296 return 3;
297 }
298 if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
299 bytes[offset+3] = (byte) (B10 | (cpt & 0x3F));
300 cpt = cpt >> 6;
301 bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
302 cpt = cpt >> 6;
303 bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
304 cpt = cpt >> 6;
305 bytes[offset] = (byte) (B11110 | (cpt & 0x07));
306 return 4;
307 }
308 throw new IOException("Illegal Unicode Codepoint "+
309 Integer.toHexString(cpt)+" in string.");
310 }
311
312 static void toBinaryString(final DataOutput out, final String str)
313 throws IOException {
314 final int strlen = str.length();
315 byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max
316 int utf8Len = 0;
317 int idx = 0;
318 while(idx < strlen) {
319 final int cpt = str.codePointAt(idx);
320 idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1;
321 utf8Len += writeUtf8(cpt, bytes, utf8Len);
322 }
323 writeVInt(out, utf8Len);
324 out.write(bytes, 0, utf8Len);
325 }
326
327 static boolean isValidCodePoint(int cpt) {
328 return !((cpt > 0x10FFFF) ||
329 (cpt >= 0xD800 && cpt <= 0xDFFF) ||
330 (cpt >= 0xFFFE && cpt <=0xFFFF));
331 }
332
333 private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) {
334 int cpt = 0;
335 cpt = (((b1 & ~B11111) << 18) |
336 ((b2 & ~B11) << 12) |
337 ((b3 & ~B11) << 6) |
338 (b4 & ~B11));
339 return cpt;
340 }
341
342 private static int utf8ToCodePoint(int b1, int b2, int b3) {
343 int cpt = 0;
344 cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11));
345 return cpt;
346 }
347
348 private static int utf8ToCodePoint(int b1, int b2) {
349 int cpt = 0;
350 cpt = (((b1 & ~B111) << 6) | (b2 & ~B11));
351 return cpt;
352 }
353
354 private static void checkB10(int b) throws IOException {
355 if ((b & B11) != B10) {
356 throw new IOException("Invalid UTF-8 representation.");
357 }
358 }
359
360 static String fromBinaryString(final DataInput din) throws IOException {
361 final int utf8Len = readVInt(din);
362 final byte[] bytes = new byte[utf8Len];
363 din.readFully(bytes);
364 int len = 0;
365 // For the most commmon case, i.e. ascii, numChars = utf8Len
366 StringBuilder sb = new StringBuilder(utf8Len);
367 while(len < utf8Len) {
368 int cpt = 0;
369 final int b1 = bytes[len++] & 0xFF;
370 if (b1 <= 0x7F) {
371 cpt = b1;
372 } else if ((b1 & B11111) == B11110) {
373 int b2 = bytes[len++] & 0xFF;
374 checkB10(b2);
375 int b3 = bytes[len++] & 0xFF;
376 checkB10(b3);
377 int b4 = bytes[len++] & 0xFF;
378 checkB10(b4);
379 cpt = utf8ToCodePoint(b1, b2, b3, b4);
380 } else if ((b1 & B1111) == B1110) {
381 int b2 = bytes[len++] & 0xFF;
382 checkB10(b2);
383 int b3 = bytes[len++] & 0xFF;
384 checkB10(b3);
385 cpt = utf8ToCodePoint(b1, b2, b3);
386 } else if ((b1 & B111) == B110) {
387 int b2 = bytes[len++] & 0xFF;
388 checkB10(b2);
389 cpt = utf8ToCodePoint(b1, b2);
390 } else {
391 throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+
392 " at offset "+(len-1)+" in length of "+utf8Len);
393 }
394 if (!isValidCodePoint(cpt)) {
395 throw new IOException("Illegal Unicode Codepoint "+
396 Integer.toHexString(cpt)+" in stream.");
397 }
398 sb.appendCodePoint(cpt);
399 }
400 return sb.toString();
401 }
402
403 /** Parse a float from a byte array. */
404 public static float readFloat(byte[] bytes, int start) {
405 return WritableComparator.readFloat(bytes, start);
406 }
407
408 /** Parse a double from a byte array. */
409 public static double readDouble(byte[] bytes, int start) {
410 return WritableComparator.readDouble(bytes, start);
411 }
412
413 /**
414 * Reads a zero-compressed encoded long from a byte array and returns it.
415 * @param bytes byte array with decode long
416 * @param start starting index
417 * @throws java.io.IOException
418 * @return deserialized long
419 */
420 public static long readVLong(byte[] bytes, int start) throws IOException {
421 return WritableComparator.readVLong(bytes, start);
422 }
423
424 /**
425 * Reads a zero-compressed encoded integer from a byte array and returns it.
426 * @param bytes byte array with the encoded integer
427 * @param start start index
428 * @throws java.io.IOException
429 * @return deserialized integer
430 */
431 public static int readVInt(byte[] bytes, int start) throws IOException {
432 return WritableComparator.readVInt(bytes, start);
433 }
434
435 /**
436 * Reads a zero-compressed encoded long from a stream and return it.
437 * @param in input stream
438 * @throws java.io.IOException
439 * @return deserialized long
440 */
441 public static long readVLong(DataInput in) throws IOException {
442 return WritableUtils.readVLong(in);
443 }
444
445 /**
446 * Reads a zero-compressed encoded integer from a stream and returns it.
447 * @param in input stream
448 * @throws java.io.IOException
449 * @return deserialized integer
450 */
451 public static int readVInt(DataInput in) throws IOException {
452 return WritableUtils.readVInt(in);
453 }
454
455 /**
456 * Get the encoded length if an integer is stored in a variable-length format
457 * @return the encoded length
458 */
459 public static int getVIntSize(long i) {
460 return WritableUtils.getVIntSize(i);
461 }
462
463 /**
464 * Serializes a long to a binary stream with zero-compressed encoding.
465 * For -112 <= i <= 127, only one byte is used with the actual value.
466 * For other values of i, the first byte value indicates whether the
467 * long is positive or negative, and the number of bytes that follow.
468 * If the first byte value v is between -113 and -120, the following long
469 * is positive, with number of bytes that follow are -(v+112).
470 * If the first byte value v is between -121 and -128, the following long
471 * is negative, with number of bytes that follow are -(v+120). Bytes are
472 * stored in the high-non-zero-byte-first order.
473 *
474 * @param stream Binary output stream
475 * @param i Long to be serialized
476 * @throws java.io.IOException
477 */
478 public static void writeVLong(DataOutput stream, long i) throws IOException {
479 WritableUtils.writeVLong(stream, i);
480 }
481
482 /**
483 * Serializes an int to a binary stream with zero-compressed encoding.
484 *
485 * @param stream Binary output stream
486 * @param i int to be serialized
487 * @throws java.io.IOException
488 */
489 public static void writeVInt(DataOutput stream, int i) throws IOException {
490 WritableUtils.writeVInt(stream, i);
491 }
492
493 /** Lexicographic order of binary data. */
494 public static int compareBytes(byte[] b1, int s1, int l1,
495 byte[] b2, int s2, int l2) {
496 return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
497 }
498 }