001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.record;
020
021import java.io.DataInput;
022import java.io.DataOutput;
023import java.io.IOException;
024
025import org.apache.hadoop.classification.InterfaceAudience;
026import org.apache.hadoop.classification.InterfaceStability;
027import org.apache.hadoop.io.WritableComparator;
028import org.apache.hadoop.io.WritableUtils;
029
030/**
031 * Various utility functions for Hadoop record I/O runtime.
032 * 
033 * @deprecated Replaced by <a href="http://avro.apache.org/">Avro</a>.
034 */
035@Deprecated
036@InterfaceAudience.Public
037@InterfaceStability.Stable
038public class Utils {
039  
040  /** Cannot create a new instance of Utils */
041  private Utils() {
042  }
043  
044  public static final char[] hexchars = { '0', '1', '2', '3', '4', '5',
045                                          '6', '7', '8', '9', 'A', 'B',
046                                          'C', 'D', 'E', 'F' };
047  /**
048   *
049   * @param s
050   * @return
051   */
052  static String toXMLString(String s) {
053    StringBuilder sb = new StringBuilder();
054    for (int idx = 0; idx < s.length(); idx++) {
055      char ch = s.charAt(idx);
056      if (ch == '<') {
057        sb.append("&lt;");
058      } else if (ch == '&') {
059        sb.append("&amp;");
060      } else if (ch == '%') {
061        sb.append("%0025");
062      } else if (ch < 0x20 ||
063                 (ch > 0xD7FF && ch < 0xE000) ||
064                 (ch > 0xFFFD)) {
065        sb.append("%");
066        sb.append(hexchars[(ch & 0xF000) >> 12]);
067        sb.append(hexchars[(ch & 0x0F00) >> 8]);
068        sb.append(hexchars[(ch & 0x00F0) >> 4]);
069        sb.append(hexchars[(ch & 0x000F)]);
070      } else {
071        sb.append(ch);
072      }
073    }
074    return sb.toString();
075  }
076  
077  static private int h2c(char ch) {
078    if (ch >= '0' && ch <= '9') {
079      return ch - '0';
080    } else if (ch >= 'A' && ch <= 'F') {
081      return ch - 'A' + 10;
082    } else if (ch >= 'a' && ch <= 'f') {
083      return ch - 'a' + 10;
084    }
085    return 0;
086  }
087  
088  /**
089   *
090   * @param s
091   * @return
092   */
093  static String fromXMLString(String s) {
094    StringBuilder sb = new StringBuilder();
095    for (int idx = 0; idx < s.length();) {
096      char ch = s.charAt(idx++);
097      if (ch == '%') {
098        int ch1 = h2c(s.charAt(idx++)) << 12;
099        int ch2 = h2c(s.charAt(idx++)) << 8;
100        int ch3 = h2c(s.charAt(idx++)) << 4;
101        int ch4 = h2c(s.charAt(idx++));
102        char res = (char)(ch1 | ch2 | ch3 | ch4);
103        sb.append(res);
104      } else {
105        sb.append(ch);
106      }
107    }
108    return sb.toString();
109  }
110  
111  /**
112   *
113   * @param s
114   * @return
115   */
116  static String toCSVString(String s) {
117    StringBuilder sb = new StringBuilder(s.length()+1);
118    sb.append('\'');
119    int len = s.length();
120    for (int i = 0; i < len; i++) {
121      char c = s.charAt(i);
122      switch(c) {
123      case '\0':
124        sb.append("%00");
125        break;
126      case '\n':
127        sb.append("%0A");
128        break;
129      case '\r':
130        sb.append("%0D");
131        break;
132      case ',':
133        sb.append("%2C");
134        break;
135      case '}':
136        sb.append("%7D");
137        break;
138      case '%':
139        sb.append("%25");
140        break;
141      default:
142        sb.append(c);
143      }
144    }
145    return sb.toString();
146  }
147  
148  /**
149   *
150   * @param s
151   * @throws java.io.IOException
152   * @return
153   */
154  static String fromCSVString(String s) throws IOException {
155    if (s.charAt(0) != '\'') {
156      throw new IOException("Error deserializing string.");
157    }
158    int len = s.length();
159    StringBuilder sb = new StringBuilder(len-1);
160    for (int i = 1; i < len; i++) {
161      char c = s.charAt(i);
162      if (c == '%') {
163        char ch1 = s.charAt(i+1);
164        char ch2 = s.charAt(i+2);
165        i += 2;
166        if (ch1 == '0' && ch2 == '0') {
167          sb.append('\0');
168        } else if (ch1 == '0' && ch2 == 'A') {
169          sb.append('\n');
170        } else if (ch1 == '0' && ch2 == 'D') {
171          sb.append('\r');
172        } else if (ch1 == '2' && ch2 == 'C') {
173          sb.append(',');
174        } else if (ch1 == '7' && ch2 == 'D') {
175          sb.append('}');
176        } else if (ch1 == '2' && ch2 == '5') {
177          sb.append('%');
178        } else {
179          throw new IOException("Error deserializing string.");
180        }
181      } else {
182        sb.append(c);
183      }
184    }
185    return sb.toString();
186  }
187  
188  /**
189   *
190   * @param s
191   * @return
192   */
193  static String toXMLBuffer(Buffer s) {
194    return s.toString();
195  }
196  
197  /**
198   *
199   * @param s
200   * @throws java.io.IOException
201   * @return
202   */
203  static Buffer fromXMLBuffer(String s)
204    throws IOException {
205    if (s.length() == 0) { return new Buffer(); }
206    int blen = s.length()/2;
207    byte[] barr = new byte[blen];
208    for (int idx = 0; idx < blen; idx++) {
209      char c1 = s.charAt(2*idx);
210      char c2 = s.charAt(2*idx+1);
211      barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
212    }
213    return new Buffer(barr);
214  }
215  
216  /**
217   *
218   * @param buf
219   * @return
220   */
221  static String toCSVBuffer(Buffer buf) {
222    StringBuilder sb = new StringBuilder("#");
223    sb.append(buf.toString());
224    return sb.toString();
225  }
226  
227  /**
228   * Converts a CSV-serialized representation of buffer to a new
229   * Buffer
230   * @param s CSV-serialized representation of buffer
231   * @throws java.io.IOException
232   * @return Deserialized Buffer
233   */
234  static Buffer fromCSVBuffer(String s)
235    throws IOException {
236    if (s.charAt(0) != '#') {
237      throw new IOException("Error deserializing buffer.");
238    }
239    if (s.length() == 1) { return new Buffer(); }
240    int blen = (s.length()-1)/2;
241    byte[] barr = new byte[blen];
242    for (int idx = 0; idx < blen; idx++) {
243      char c1 = s.charAt(2*idx+1);
244      char c2 = s.charAt(2*idx+2);
245      barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
246    }
247    return new Buffer(barr);
248  }
249  
250  private static int utf8LenForCodePoint(final int cpt) throws IOException {
251    if (cpt >=0 && cpt <= 0x7F) {
252      return 1;
253    }
254    if (cpt >= 0x80 && cpt <= 0x07FF) {
255      return 2;
256    }
257    if ((cpt >= 0x0800 && cpt < 0xD800) ||
258        (cpt > 0xDFFF && cpt <= 0xFFFD)) {
259      return 3;
260    }
261    if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
262      return 4;
263    }
264    throw new IOException("Illegal Unicode Codepoint "+
265                          Integer.toHexString(cpt)+" in string.");
266  }
267  
268  private static final int B10 =    Integer.parseInt("10000000", 2);
269  private static final int B110 =   Integer.parseInt("11000000", 2);
270  private static final int B1110 =  Integer.parseInt("11100000", 2);
271  private static final int B11110 = Integer.parseInt("11110000", 2);
272  private static final int B11 =    Integer.parseInt("11000000", 2);
273  private static final int B111 =   Integer.parseInt("11100000", 2);
274  private static final int B1111 =  Integer.parseInt("11110000", 2);
275  private static final int B11111 = Integer.parseInt("11111000", 2);
276  
277  private static int writeUtf8(int cpt, final byte[] bytes, final int offset)
278    throws IOException {
279    if (cpt >=0 && cpt <= 0x7F) {
280      bytes[offset] = (byte) cpt;
281      return 1;
282    }
283    if (cpt >= 0x80 && cpt <= 0x07FF) {
284      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
285      cpt = cpt >> 6;
286      bytes[offset] = (byte) (B110 | (cpt & 0x1F));
287      return 2;
288    }
289    if ((cpt >= 0x0800 && cpt < 0xD800) ||
290        (cpt > 0xDFFF && cpt <= 0xFFFD)) {
291      bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
292      cpt = cpt >> 6;
293      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
294      cpt = cpt >> 6;
295      bytes[offset] = (byte) (B1110 | (cpt & 0x0F));
296      return 3;
297    }
298    if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
299      bytes[offset+3] = (byte) (B10 | (cpt & 0x3F));
300      cpt = cpt >> 6;
301      bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
302      cpt = cpt >> 6;
303      bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
304      cpt = cpt >> 6;
305      bytes[offset] = (byte) (B11110 | (cpt & 0x07));
306      return 4;
307    }
308    throw new IOException("Illegal Unicode Codepoint "+
309                          Integer.toHexString(cpt)+" in string.");
310  }
311  
312  static void toBinaryString(final DataOutput out, final String str)
313    throws IOException {
314    final int strlen = str.length();
315    byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max
316    int utf8Len = 0;
317    int idx = 0;
318    while(idx < strlen) {
319      final int cpt = str.codePointAt(idx);
320      idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1;
321      utf8Len += writeUtf8(cpt, bytes, utf8Len);
322    }
323    writeVInt(out, utf8Len);
324    out.write(bytes, 0, utf8Len);
325  }
326  
327  static boolean isValidCodePoint(int cpt) {
328    return !((cpt > 0x10FFFF) ||
329             (cpt >= 0xD800 && cpt <= 0xDFFF) ||
330             (cpt >= 0xFFFE && cpt <=0xFFFF));
331  }
332  
333  private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) {
334    int cpt = 0;
335    cpt = (((b1 & ~B11111) << 18) |
336           ((b2 & ~B11) << 12) |
337           ((b3 & ~B11) << 6) |
338           (b4 & ~B11));
339    return cpt;
340  }
341  
342  private static int utf8ToCodePoint(int b1, int b2, int b3) {
343    int cpt = 0;
344    cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11));
345    return cpt;
346  }
347  
348  private static int utf8ToCodePoint(int b1, int b2) {
349    int cpt = 0;
350    cpt = (((b1 & ~B111) << 6) | (b2 & ~B11));
351    return cpt;
352  }
353  
354  private static void checkB10(int b) throws IOException {
355    if ((b & B11) != B10) {
356      throw new IOException("Invalid UTF-8 representation.");
357    }
358  }
359  
360  static String fromBinaryString(final DataInput din) throws IOException {
361    final int utf8Len = readVInt(din);
362    final byte[] bytes = new byte[utf8Len];
363    din.readFully(bytes);
364    int len = 0;
365    // For the most commmon case, i.e. ascii, numChars = utf8Len
366    StringBuilder sb = new StringBuilder(utf8Len);
367    while(len < utf8Len) {
368      int cpt = 0;
369      final int b1 = bytes[len++] & 0xFF;
370      if (b1 <= 0x7F) {
371        cpt = b1;
372      } else if ((b1 & B11111) == B11110) {
373        int b2 = bytes[len++] & 0xFF;
374        checkB10(b2);
375        int b3 = bytes[len++] & 0xFF;
376        checkB10(b3);
377        int b4 = bytes[len++] & 0xFF;
378        checkB10(b4);
379        cpt = utf8ToCodePoint(b1, b2, b3, b4);
380      } else if ((b1 & B1111) == B1110) {
381        int b2 = bytes[len++] & 0xFF;
382        checkB10(b2);
383        int b3 = bytes[len++] & 0xFF;
384        checkB10(b3);
385        cpt = utf8ToCodePoint(b1, b2, b3);
386      } else if ((b1 & B111) == B110) {
387        int b2 = bytes[len++] & 0xFF;
388        checkB10(b2);
389        cpt = utf8ToCodePoint(b1, b2);
390      } else {
391        throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+
392                              " at offset "+(len-1)+" in length of "+utf8Len);
393      }
394      if (!isValidCodePoint(cpt)) {
395        throw new IOException("Illegal Unicode Codepoint "+
396                              Integer.toHexString(cpt)+" in stream.");
397      }
398      sb.appendCodePoint(cpt);
399    }
400    return sb.toString();
401  }
402  
403  /** Parse a float from a byte array. */
404  public static float readFloat(byte[] bytes, int start) {
405    return WritableComparator.readFloat(bytes, start);
406  }
407  
408  /** Parse a double from a byte array. */
409  public static double readDouble(byte[] bytes, int start) {
410    return WritableComparator.readDouble(bytes, start);
411  }
412  
413  /**
414   * Reads a zero-compressed encoded long from a byte array and returns it.
415   * @param bytes byte array with decode long
416   * @param start starting index
417   * @throws java.io.IOException
418   * @return deserialized long
419   */
420  public static long readVLong(byte[] bytes, int start) throws IOException {
421    return WritableComparator.readVLong(bytes, start);
422  }
423  
424  /**
425   * Reads a zero-compressed encoded integer from a byte array and returns it.
426   * @param bytes byte array with the encoded integer
427   * @param start start index
428   * @throws java.io.IOException
429   * @return deserialized integer
430   */
431  public static int readVInt(byte[] bytes, int start) throws IOException {
432    return WritableComparator.readVInt(bytes, start);
433  }
434  
435  /**
436   * Reads a zero-compressed encoded long from a stream and return it.
437   * @param in input stream
438   * @throws java.io.IOException
439   * @return deserialized long
440   */
441  public static long readVLong(DataInput in) throws IOException {
442    return WritableUtils.readVLong(in);
443  }
444  
445  /**
446   * Reads a zero-compressed encoded integer from a stream and returns it.
447   * @param in input stream
448   * @throws java.io.IOException
449   * @return deserialized integer
450   */
451  public static int readVInt(DataInput in) throws IOException {
452    return WritableUtils.readVInt(in);
453  }
454  
455  /**
456   * Get the encoded length if an integer is stored in a variable-length format
457   * @return the encoded length
458   */
459  public static int getVIntSize(long i) {
460    return WritableUtils.getVIntSize(i);
461  }
462  
463  /**
464   * Serializes a long to a binary stream with zero-compressed encoding.
465   * For {@literal -112 <= i <= 127}, only one byte is used with the actual
466   * value. For other values of i, the first byte value indicates whether the
467   * long is positive or negative, and the number of bytes that follow.
468   * If the first byte value v is between -113 and -120, the following long
469   * is positive, with number of bytes that follow are -(v+112).
470   * If the first byte value v is between -121 and -128, the following long
471   * is negative, with number of bytes that follow are -(v+120). Bytes are
472   * stored in the high-non-zero-byte-first order.
473   *
474   * @param stream Binary output stream
475   * @param i Long to be serialized
476   * @throws java.io.IOException
477   */
478  public static void writeVLong(DataOutput stream, long i) throws IOException {
479    WritableUtils.writeVLong(stream, i);
480  }
481  
482  /**
483   * Serializes an int to a binary stream with zero-compressed encoding.
484   *
485   * @param stream Binary output stream
486   * @param i int to be serialized
487   * @throws java.io.IOException
488   */
489  public static void writeVInt(DataOutput stream, int i) throws IOException {
490    WritableUtils.writeVInt(stream, i);
491  }
492  
493  /** Lexicographic order of binary data. */
494  public static int compareBytes(byte[] b1, int s1, int l1,
495                                 byte[] b2, int s2, int l2) {
496    return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
497  }
498}