001    /**
002     * Licensed to the Apache Software Foundation (ASF) under one
003     * or more contributor license agreements.  See the NOTICE file
004     * distributed with this work for additional information
005     * regarding copyright ownership.  The ASF licenses this file
006     * to you under the Apache License, Version 2.0 (the
007     * "License"); you may not use this file except in compliance
008     * with the License.  You may obtain a copy of the License at
009     *
010     *     http://www.apache.org/licenses/LICENSE-2.0
011     *
012     * Unless required by applicable law or agreed to in writing, software
013     * distributed under the License is distributed on an "AS IS" BASIS,
014     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015     * See the License for the specific language governing permissions and
016     * limitations under the License.
017     */
018    
019    package org.apache.hadoop.record;
020    
021    import java.io.DataInput;
022    import java.io.DataOutput;
023    import java.io.IOException;
024    
025    import org.apache.hadoop.classification.InterfaceAudience;
026    import org.apache.hadoop.classification.InterfaceStability;
027    import org.apache.hadoop.io.WritableComparator;
028    import org.apache.hadoop.io.WritableUtils;
029    
030    /**
031     * Various utility functions for the Hadoop record I/O runtime.
032     * 
033     * @deprecated Replaced by <a href="https://hadoop.apache.org/avro/">Avro</a>.
034     */
035    @Deprecated
036    @InterfaceAudience.Public
037    @InterfaceStability.Stable
038    public class Utils {
039      
040      /** Cannot create a new instance of Utils */
041      private Utils() {
042      }
043      
044      public static final char[] hexchars = { '0', '1', '2', '3', '4', '5',
045                                              '6', '7', '8', '9', 'A', 'B',
046                                              'C', 'D', 'E', 'F' };
047      /**
048       *
049       * @param s
050       * @return
051       */
052      static String toXMLString(String s) {
053        StringBuilder sb = new StringBuilder();
054        for (int idx = 0; idx < s.length(); idx++) {
055          char ch = s.charAt(idx);
056          if (ch == '<') {
057            sb.append("&lt;");
058          } else if (ch == '&') {
059            sb.append("&amp;");
060          } else if (ch == '%') {
061            sb.append("%0025");
062          } else if (ch < 0x20 ||
063                     (ch > 0xD7FF && ch < 0xE000) ||
064                     (ch > 0xFFFD)) {
065            sb.append("%");
066            sb.append(hexchars[(ch & 0xF000) >> 12]);
067            sb.append(hexchars[(ch & 0x0F00) >> 8]);
068            sb.append(hexchars[(ch & 0x00F0) >> 4]);
069            sb.append(hexchars[(ch & 0x000F)]);
070          } else {
071            sb.append(ch);
072          }
073        }
074        return sb.toString();
075      }
076      
077      static private int h2c(char ch) {
078        if (ch >= '0' && ch <= '9') {
079          return ch - '0';
080        } else if (ch >= 'A' && ch <= 'F') {
081          return ch - 'A' + 10;
082        } else if (ch >= 'a' && ch <= 'f') {
083          return ch - 'a' + 10;
084        }
085        return 0;
086      }
087      
088      /**
089       *
090       * @param s
091       * @return
092       */
093      static String fromXMLString(String s) {
094        StringBuilder sb = new StringBuilder();
095        for (int idx = 0; idx < s.length();) {
096          char ch = s.charAt(idx++);
097          if (ch == '%') {
098            int ch1 = h2c(s.charAt(idx++)) << 12;
099            int ch2 = h2c(s.charAt(idx++)) << 8;
100            int ch3 = h2c(s.charAt(idx++)) << 4;
101            int ch4 = h2c(s.charAt(idx++));
102            char res = (char)(ch1 | ch2 | ch3 | ch4);
103            sb.append(res);
104          } else {
105            sb.append(ch);
106          }
107        }
108        return sb.toString();
109      }
110      
111      /**
112       *
113       * @param s
114       * @return
115       */
116      static String toCSVString(String s) {
117        StringBuilder sb = new StringBuilder(s.length()+1);
118        sb.append('\'');
119        int len = s.length();
120        for (int i = 0; i < len; i++) {
121          char c = s.charAt(i);
122          switch(c) {
123          case '\0':
124            sb.append("%00");
125            break;
126          case '\n':
127            sb.append("%0A");
128            break;
129          case '\r':
130            sb.append("%0D");
131            break;
132          case ',':
133            sb.append("%2C");
134            break;
135          case '}':
136            sb.append("%7D");
137            break;
138          case '%':
139            sb.append("%25");
140            break;
141          default:
142            sb.append(c);
143          }
144        }
145        return sb.toString();
146      }
147      
148      /**
149       *
150       * @param s
151       * @throws java.io.IOException
152       * @return
153       */
154      static String fromCSVString(String s) throws IOException {
155        if (s.charAt(0) != '\'') {
156          throw new IOException("Error deserializing string.");
157        }
158        int len = s.length();
159        StringBuilder sb = new StringBuilder(len-1);
160        for (int i = 1; i < len; i++) {
161          char c = s.charAt(i);
162          if (c == '%') {
163            char ch1 = s.charAt(i+1);
164            char ch2 = s.charAt(i+2);
165            i += 2;
166            if (ch1 == '0' && ch2 == '0') {
167              sb.append('\0');
168            } else if (ch1 == '0' && ch2 == 'A') {
169              sb.append('\n');
170            } else if (ch1 == '0' && ch2 == 'D') {
171              sb.append('\r');
172            } else if (ch1 == '2' && ch2 == 'C') {
173              sb.append(',');
174            } else if (ch1 == '7' && ch2 == 'D') {
175              sb.append('}');
176            } else if (ch1 == '2' && ch2 == '5') {
177              sb.append('%');
178            } else {
179              throw new IOException("Error deserializing string.");
180            }
181          } else {
182            sb.append(c);
183          }
184        }
185        return sb.toString();
186      }
187      
188      /**
189       *
190       * @param s
191       * @return
192       */
193      static String toXMLBuffer(Buffer s) {
194        return s.toString();
195      }
196      
197      /**
198       *
199       * @param s
200       * @throws java.io.IOException
201       * @return
202       */
203      static Buffer fromXMLBuffer(String s)
204        throws IOException {
205        if (s.length() == 0) { return new Buffer(); }
206        int blen = s.length()/2;
207        byte[] barr = new byte[blen];
208        for (int idx = 0; idx < blen; idx++) {
209          char c1 = s.charAt(2*idx);
210          char c2 = s.charAt(2*idx+1);
211          barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
212        }
213        return new Buffer(barr);
214      }
215      
216      /**
217       *
218       * @param buf
219       * @return
220       */
221      static String toCSVBuffer(Buffer buf) {
222        StringBuilder sb = new StringBuilder("#");
223        sb.append(buf.toString());
224        return sb.toString();
225      }
226      
227      /**
228       * Converts a CSV-serialized representation of buffer to a new
229       * Buffer
230       * @param s CSV-serialized representation of buffer
231       * @throws java.io.IOException
232       * @return Deserialized Buffer
233       */
234      static Buffer fromCSVBuffer(String s)
235        throws IOException {
236        if (s.charAt(0) != '#') {
237          throw new IOException("Error deserializing buffer.");
238        }
239        if (s.length() == 1) { return new Buffer(); }
240        int blen = (s.length()-1)/2;
241        byte[] barr = new byte[blen];
242        for (int idx = 0; idx < blen; idx++) {
243          char c1 = s.charAt(2*idx+1);
244          char c2 = s.charAt(2*idx+2);
245          barr[idx] = (byte)Integer.parseInt(""+c1+c2, 16);
246        }
247        return new Buffer(barr);
248      }
249      
250      private static int utf8LenForCodePoint(final int cpt) throws IOException {
251        if (cpt >=0 && cpt <= 0x7F) {
252          return 1;
253        }
254        if (cpt >= 0x80 && cpt <= 0x07FF) {
255          return 2;
256        }
257        if ((cpt >= 0x0800 && cpt < 0xD800) ||
258            (cpt > 0xDFFF && cpt <= 0xFFFD)) {
259          return 3;
260        }
261        if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
262          return 4;
263        }
264        throw new IOException("Illegal Unicode Codepoint "+
265                              Integer.toHexString(cpt)+" in string.");
266      }
267      
268      private static final int B10 =    Integer.parseInt("10000000", 2);
269      private static final int B110 =   Integer.parseInt("11000000", 2);
270      private static final int B1110 =  Integer.parseInt("11100000", 2);
271      private static final int B11110 = Integer.parseInt("11110000", 2);
272      private static final int B11 =    Integer.parseInt("11000000", 2);
273      private static final int B111 =   Integer.parseInt("11100000", 2);
274      private static final int B1111 =  Integer.parseInt("11110000", 2);
275      private static final int B11111 = Integer.parseInt("11111000", 2);
276      
277      private static int writeUtf8(int cpt, final byte[] bytes, final int offset)
278        throws IOException {
279        if (cpt >=0 && cpt <= 0x7F) {
280          bytes[offset] = (byte) cpt;
281          return 1;
282        }
283        if (cpt >= 0x80 && cpt <= 0x07FF) {
284          bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
285          cpt = cpt >> 6;
286          bytes[offset] = (byte) (B110 | (cpt & 0x1F));
287          return 2;
288        }
289        if ((cpt >= 0x0800 && cpt < 0xD800) ||
290            (cpt > 0xDFFF && cpt <= 0xFFFD)) {
291          bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
292          cpt = cpt >> 6;
293          bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
294          cpt = cpt >> 6;
295          bytes[offset] = (byte) (B1110 | (cpt & 0x0F));
296          return 3;
297        }
298        if (cpt >= 0x10000 && cpt <= 0x10FFFF) {
299          bytes[offset+3] = (byte) (B10 | (cpt & 0x3F));
300          cpt = cpt >> 6;
301          bytes[offset+2] = (byte) (B10 | (cpt & 0x3F));
302          cpt = cpt >> 6;
303          bytes[offset+1] = (byte) (B10 | (cpt & 0x3F));
304          cpt = cpt >> 6;
305          bytes[offset] = (byte) (B11110 | (cpt & 0x07));
306          return 4;
307        }
308        throw new IOException("Illegal Unicode Codepoint "+
309                              Integer.toHexString(cpt)+" in string.");
310      }
311      
312      static void toBinaryString(final DataOutput out, final String str)
313        throws IOException {
314        final int strlen = str.length();
315        byte[] bytes = new byte[strlen*4]; // Codepoints expand to 4 bytes max
316        int utf8Len = 0;
317        int idx = 0;
318        while(idx < strlen) {
319          final int cpt = str.codePointAt(idx);
320          idx += Character.isSupplementaryCodePoint(cpt) ? 2 : 1;
321          utf8Len += writeUtf8(cpt, bytes, utf8Len);
322        }
323        writeVInt(out, utf8Len);
324        out.write(bytes, 0, utf8Len);
325      }
326      
327      static boolean isValidCodePoint(int cpt) {
328        return !((cpt > 0x10FFFF) ||
329                 (cpt >= 0xD800 && cpt <= 0xDFFF) ||
330                 (cpt >= 0xFFFE && cpt <=0xFFFF));
331      }
332      
333      private static int utf8ToCodePoint(int b1, int b2, int b3, int b4) {
334        int cpt = 0;
335        cpt = (((b1 & ~B11111) << 18) |
336               ((b2 & ~B11) << 12) |
337               ((b3 & ~B11) << 6) |
338               (b4 & ~B11));
339        return cpt;
340      }
341      
342      private static int utf8ToCodePoint(int b1, int b2, int b3) {
343        int cpt = 0;
344        cpt = (((b1 & ~B1111) << 12) | ((b2 & ~B11) << 6) | (b3 & ~B11));
345        return cpt;
346      }
347      
348      private static int utf8ToCodePoint(int b1, int b2) {
349        int cpt = 0;
350        cpt = (((b1 & ~B111) << 6) | (b2 & ~B11));
351        return cpt;
352      }
353      
354      private static void checkB10(int b) throws IOException {
355        if ((b & B11) != B10) {
356          throw new IOException("Invalid UTF-8 representation.");
357        }
358      }
359      
360      static String fromBinaryString(final DataInput din) throws IOException {
361        final int utf8Len = readVInt(din);
362        final byte[] bytes = new byte[utf8Len];
363        din.readFully(bytes);
364        int len = 0;
365        // For the most commmon case, i.e. ascii, numChars = utf8Len
366        StringBuilder sb = new StringBuilder(utf8Len);
367        while(len < utf8Len) {
368          int cpt = 0;
369          final int b1 = bytes[len++] & 0xFF;
370          if (b1 <= 0x7F) {
371            cpt = b1;
372          } else if ((b1 & B11111) == B11110) {
373            int b2 = bytes[len++] & 0xFF;
374            checkB10(b2);
375            int b3 = bytes[len++] & 0xFF;
376            checkB10(b3);
377            int b4 = bytes[len++] & 0xFF;
378            checkB10(b4);
379            cpt = utf8ToCodePoint(b1, b2, b3, b4);
380          } else if ((b1 & B1111) == B1110) {
381            int b2 = bytes[len++] & 0xFF;
382            checkB10(b2);
383            int b3 = bytes[len++] & 0xFF;
384            checkB10(b3);
385            cpt = utf8ToCodePoint(b1, b2, b3);
386          } else if ((b1 & B111) == B110) {
387            int b2 = bytes[len++] & 0xFF;
388            checkB10(b2);
389            cpt = utf8ToCodePoint(b1, b2);
390          } else {
391            throw new IOException("Invalid UTF-8 byte "+Integer.toHexString(b1)+
392                                  " at offset "+(len-1)+" in length of "+utf8Len);
393          }
394          if (!isValidCodePoint(cpt)) {
395            throw new IOException("Illegal Unicode Codepoint "+
396                                  Integer.toHexString(cpt)+" in stream.");
397          }
398          sb.appendCodePoint(cpt);
399        }
400        return sb.toString();
401      }
402      
403      /** Parse a float from a byte array. */
404      public static float readFloat(byte[] bytes, int start) {
405        return WritableComparator.readFloat(bytes, start);
406      }
407      
408      /** Parse a double from a byte array. */
409      public static double readDouble(byte[] bytes, int start) {
410        return WritableComparator.readDouble(bytes, start);
411      }
412      
413      /**
414       * Reads a zero-compressed encoded long from a byte array and returns it.
415       * @param bytes byte array with decode long
416       * @param start starting index
417       * @throws java.io.IOException
418       * @return deserialized long
419       */
420      public static long readVLong(byte[] bytes, int start) throws IOException {
421        return WritableComparator.readVLong(bytes, start);
422      }
423      
424      /**
425       * Reads a zero-compressed encoded integer from a byte array and returns it.
426       * @param bytes byte array with the encoded integer
427       * @param start start index
428       * @throws java.io.IOException
429       * @return deserialized integer
430       */
431      public static int readVInt(byte[] bytes, int start) throws IOException {
432        return WritableComparator.readVInt(bytes, start);
433      }
434      
435      /**
436       * Reads a zero-compressed encoded long from a stream and return it.
437       * @param in input stream
438       * @throws java.io.IOException
439       * @return deserialized long
440       */
441      public static long readVLong(DataInput in) throws IOException {
442        return WritableUtils.readVLong(in);
443      }
444      
445      /**
446       * Reads a zero-compressed encoded integer from a stream and returns it.
447       * @param in input stream
448       * @throws java.io.IOException
449       * @return deserialized integer
450       */
451      public static int readVInt(DataInput in) throws IOException {
452        return WritableUtils.readVInt(in);
453      }
454      
455      /**
456       * Get the encoded length if an integer is stored in a variable-length format
457       * @return the encoded length
458       */
459      public static int getVIntSize(long i) {
460        return WritableUtils.getVIntSize(i);
461      }
462      
463      /**
464       * Serializes a long to a binary stream with zero-compressed encoding.
465       * For -112 <= i <= 127, only one byte is used with the actual value.
466       * For other values of i, the first byte value indicates whether the
467       * long is positive or negative, and the number of bytes that follow.
468       * If the first byte value v is between -113 and -120, the following long
469       * is positive, with number of bytes that follow are -(v+112).
470       * If the first byte value v is between -121 and -128, the following long
471       * is negative, with number of bytes that follow are -(v+120). Bytes are
472       * stored in the high-non-zero-byte-first order.
473       *
474       * @param stream Binary output stream
475       * @param i Long to be serialized
476       * @throws java.io.IOException
477       */
478      public static void writeVLong(DataOutput stream, long i) throws IOException {
479        WritableUtils.writeVLong(stream, i);
480      }
481      
482      /**
483       * Serializes an int to a binary stream with zero-compressed encoding.
484       *
485       * @param stream Binary output stream
486       * @param i int to be serialized
487       * @throws java.io.IOException
488       */
489      public static void writeVInt(DataOutput stream, int i) throws IOException {
490        WritableUtils.writeVInt(stream, i);
491      }
492      
493      /** Lexicographic order of binary data. */
494      public static int compareBytes(byte[] b1, int s1, int l1,
495                                     byte[] b2, int s2, int l2) {
496        return WritableComparator.compareBytes(b1, s1, l1, b2, s2, l2);
497      }
498    }