001/**
002 * Licensed to the Apache Software Foundation (ASF) under one
003 * or more contributor license agreements.  See the NOTICE file
004 * distributed with this work for additional information
005 * regarding copyright ownership.  The ASF licenses this file
006 * to you under the Apache License, Version 2.0 (the
007 * "License"); you may not use this file except in compliance
008 * with the License.  You may obtain a copy of the License at
009 *
010 *     http://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018
019package org.apache.hadoop.hdfs.util;
020
021import org.apache.hadoop.classification.InterfaceAudience;
022import org.apache.hadoop.classification.InterfaceStability;
023import org.xml.sax.ContentHandler;
024import org.xml.sax.SAXException;
025import org.xml.sax.helpers.AttributesImpl;
026
027import java.util.LinkedList;
028import java.util.List;
029import java.util.Map;
030import java.util.TreeMap;
031
032/**
033 * General xml utilities.
034 *   
035 */
036@InterfaceAudience.Private
037@InterfaceStability.Unstable
public class XMLUtils {
  /**
   * Exception that reflects an invalid XML document.
   */
  public static class InvalidXmlException extends RuntimeException {
    private static final long serialVersionUID = 1L;

    public InvalidXmlException(String s) {
      super(s);
    }
  }

  /**
   * Exception that reflects a string that cannot be unmangled.
   */
  public static class UnmanglingError extends RuntimeException {
    private static final long serialVersionUID = 1L;

    public UnmanglingError(String str, Exception e) {
      super(str, e);
    }

    public UnmanglingError(String str) {
      super(str);
    }
  }

  /**
   * Given a code point, determine if it should be mangled before being
   * represented in an XML document.
   *
   * Any code point that isn't valid in XML must be mangled.
   * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a
   * quick reference, or the w3 standard for the authoritative reference.
   *
   * @param cp      The code point
   * @return        True if the code point should be mangled
   */
  private static boolean codePointMustBeMangled(int cp) {
    if (cp < 0x20) {
      // Of the control characters only tab, line feed and carriage return
      // are passed through unmangled.
      return ((cp != 0x9) && (cp != 0xa) && (cp != 0xd));
    } else if ((0xd7ff < cp) && (cp < 0xe000)) {
      // Surrogate range; codePointAt() only yields these when unpaired.
      return true;
    } else if ((cp == 0xfffe) || (cp == 0xffff)) {
      return true;
    } else if (cp == 0x5c) {
      // we mangle backslash to simplify decoding... it's
      // easier if backslashes always begin mangled sequences.
      return true;
    }
    return false;
  }

  /** Number of hexadecimal digits that follow the backslash in an escape. */
  private static final int NUM_SLASH_POSITIONS = 4;

  /**
   * Build the escape sequence for a code point: a backslash, the code point
   * as zero-padded lower-case hexadecimal, and a terminating semicolon.
   *
   * @param cp      The code point to escape
   * @return        The escape sequence, e.g. <code>\005c;</code> for backslash
   */
  private static String mangleCodePoint(int cp) {
    return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp);
  }

  /**
   * Map a code point to its predefined XML entity reference.
   *
   * @param cp      The code point
   * @return        The entity reference, or null if the code point is not
   *                one of the five characters that have one
   */
  private static String codePointToEntityRef(int cp) {
    switch (cp) {
      case '&':
        return "&amp;";
      case '\"':
        return "&quot;";
      case '\'':
        return "&apos;";
      case '<':
        return "&lt;";
      case '>':
        return "&gt;";
      default:
        return null;
    }
  }

  /**
   * Translate one of the five predefined XML entity references back into
   * the character it stands for.
   *
   * @param ref     The complete reference, including '&amp;' and ';'
   * @return        The character the reference stands for
   * @throws        UnmanglingError if the reference is not recognized
   */
  private static char entityRefToChar(String ref) throws UnmanglingError {
    if (ref.equals("&quot;")) {
      return '\"';
    } else if (ref.equals("&apos;")) {
      return '\'';
    } else if (ref.equals("&amp;")) {
      return '&';
    } else if (ref.equals("&lt;")) {
      return '<';
    } else if (ref.equals("&gt;")) {
      return '>';
    }
    throw new UnmanglingError("Unknown entity ref " + ref);
  }

  /**
   * Parse the hexadecimal digits of a code point escape.
   *
   * @param hex     The NUM_SLASH_POSITIONS characters that followed the
   *                backslash
   * @return        The decoded code point
   * @throws        UnmanglingError if the characters are not a plain
   *                (unsigned) hexadecimal number
   */
  private static int parseEscapedCodePoint(String hex) throws UnmanglingError {
    // Integer.parseInt tolerates a leading sign.  A sign is never produced
    // by mangleCodePoint, and a negative result would make
    // StringBuilder.appendCodePoint throw IllegalArgumentException instead
    // of the UnmanglingError that callers are told to expect.
    final char first = hex.charAt(0);
    if ((first == '-') || (first == '+')) {
      throw new UnmanglingError("error parsing unmangling escape code: " +
          "unexpected sign in '" + hex + "'");
    }
    try {
      return Integer.parseInt(hex, 16);
    } catch (NumberFormatException e) {
      throw new UnmanglingError("error parsing unmangling escape code", e);
    }
  }

  /**
   * Mangle a string so that it can be represented in an XML document.
   *
   * There are three kinds of code points in XML:
   * - Those that can be represented normally,
   * - Those that have to be escaped (for example, &amp; must be represented
   *     as &amp;amp;)
   * - Those that cannot be represented at all in XML.
   *
   * The built-in SAX functions will handle the first two types for us just
   * fine.  However, sometimes we come across a code point of the third type.
   * In this case, we have to mangle the string in order to represent it at
   * all.  We also mangle backslash to avoid confusing a backslash in the
   * string with part of our escape sequence.
   *
   * The encoding used here is as follows: an illegal code point is
   * represented as '\ABCD;', where ABCD is the hexadecimal value of
   * the code point.
   *
   * @param str               The input string.
   * @param createEntityRefs  If true, the characters that have a predefined
   *                          XML entity reference (&amp;, &quot;, &apos;,
   *                          &lt;, &gt;) are replaced by that reference;
   *                          if false, they are copied unchanged.
   *
   * @return        The mangled string.
   */
  public static String mangleXmlString(String str, boolean createEntityRefs) {
    final int length = str.length();
    final StringBuilder bld = new StringBuilder(length);
    int offset = 0;
    while (offset < length) {
      final int cp = str.codePointAt(offset);
      if (codePointMustBeMangled(cp)) {
        bld.append(mangleCodePoint(cp));
      } else {
        final String entityRef =
            createEntityRefs ? codePointToEntityRef(cp) : null;
        if (entityRef != null) {
          bld.append(entityRef);
        } else {
          bld.appendCodePoint(cp);
        }
      }
      // Advance by one or two chars depending on whether cp is supplementary.
      offset += Character.charCount(cp);
    }
    return bld.toString();
  }

  /**
   * Demangle a string from an XML document.
   * See {@link #mangleXmlString(String, boolean)} for a description of the
   * mangling format.
   *
   * @param str               The string to be demangled.
   * @param decodeEntityRefs  If true, the five predefined XML entity
   *                          references are decoded back into characters;
   *                          if false, '&amp;' is copied like any other
   *                          character.
   *
   * @return       The unmangled string
   * @throws       UnmanglingError if the input is malformed.
   */
  public static String unmangleXmlString(String str, boolean decodeEntityRefs)
        throws UnmanglingError {
    final StringBuilder bld = new StringBuilder(str.length());
    // Hex digits of the escape currently being read.
    final StringBuilder escapedCp = new StringBuilder(NUM_SLASH_POSITIONS);
    // Number of hex digits read since the last backslash, or -1 when we are
    // not inside an escape.
    int slashPosition = -1;
    // Non-null while we are collecting the characters of an entity ref.
    StringBuilder entityRef = null;
    for (int i = 0; i < str.length(); i++) {
      final char ch = str.charAt(i);
      if (entityRef != null) {
        entityRef.append(ch);
        if (ch == ';') {
          bld.append(entityRefToChar(entityRef.toString()));
          entityRef = null;
        }
      } else if ((slashPosition >= 0) &&
          (slashPosition < NUM_SLASH_POSITIONS)) {
        escapedCp.append(ch);
        ++slashPosition;
      } else if (slashPosition == NUM_SLASH_POSITIONS) {
        if (ch != ';') {
          throw new UnmanglingError("unterminated code point escape: " +
              "expected semicolon at end.");
        }
        bld.appendCodePoint(parseEscapedCodePoint(escapedCp.toString()));
        escapedCp.setLength(0);
        slashPosition = -1;
      } else if (ch == '\\') {
        slashPosition = 0;
      } else if (decodeEntityRefs && (ch == '&')) {
        entityRef = new StringBuilder("&");
      } else {
        bld.append(ch);
      }
    }
    if (entityRef != null) {
      throw new UnmanglingError("unterminated entity ref starting with " +
          entityRef.toString());
    } else if (slashPosition != -1) {
      throw new UnmanglingError("unterminated code point escape: string " +
          "broke off in the middle");
    }
    return bld.toString();
  }

  /**
   * Add a SAX tag with a string inside.
   *
   * The string is mangled without creating entity references before it is
   * handed to the content handler.
   *
   * @param contentHandler     the SAX content handler
   * @param tag                the element tag to use
   * @param val                the string to put inside the tag
   * @throws SAXException      if the content handler reports an error
   */
  public static void addSaxString(ContentHandler contentHandler,
      String tag, String val) throws SAXException {
    contentHandler.startElement("", "", tag, new AttributesImpl());
    final char[] c = mangleXmlString(val, false).toCharArray();
    contentHandler.characters(c, 0, c.length);
    contentHandler.endElement("", "", tag);
  }

  /**
   * Represents a bag of key-value pairs encountered during parsing an XML
   * file.
   */
  public static class Stanza {
    /** Child stanzas grouped by name; each name maps to one or more. */
    private final TreeMap<String, LinkedList<Stanza>> subtrees;

    /** The unmangled value of this stanza. */
    private String value;

    public Stanza() {
      subtrees = new TreeMap<String, LinkedList<Stanza>>();
      value = "";
    }

    public void setValue(String value) {
      this.value = value;
    }

    public String getValue() {
      return this.value;
    }

    /**
     * Discover if a stanza has a given entry.
     *
     * @param name        entry to look for
     *
     * @return            true if the entry was found
     */
    public boolean hasChildren(String name) {
      return subtrees.containsKey(name);
    }

    /**
     * Pull an entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException  if there is no entry with that name
     */
    public List<Stanza> getChildren(String name) throws InvalidXmlException {
      final LinkedList<Stanza> children = subtrees.get(name);
      if (children == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return children;
    }

    /**
     * Pull a string entry from a stanza.
     *
     * @param name        entry to look for
     *
     * @return            the entry
     * @throws InvalidXmlException  if there is no entry with that name, or
     *                              more than one
     */
    public String getValue(String name) throws InvalidXmlException {
      final String ret = getValueOrNull(name);
      if (ret == null) {
        throw new InvalidXmlException("no entry found for " + name);
      }
      return ret;
    }

    /**
     * Pull a string entry from a stanza, or null.
     *
     * @param name        entry to look for
     *
     * @return            the entry, or null if it was not found.
     * @throws InvalidXmlException  if more than one entry has that name
     */
    public String getValueOrNull(String name) throws InvalidXmlException {
      final LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        return null;
      }
      if (l.size() != 1) {
        throw new InvalidXmlException("More than one value found for " + name);
      }
      return l.get(0).getValue();
    }

    /**
     * Add an entry to a stanza.
     *
     * @param name        name of the entry to add
     * @param child       the entry to add
     */
    public void addChild(String name, Stanza child) {
      LinkedList<Stanza> l = subtrees.get(name);
      if (l == null) {
        l = new LinkedList<Stanza>();
        subtrees.put(name, l);
      }
      l.add(child);
    }

    /**
     * Convert a stanza to a human-readable string.
     */
    @Override
    public String toString() {
      final StringBuilder bld = new StringBuilder();
      bld.append("{");
      // A null value (possible via setValue) is printed like an empty one
      // rather than making toString() throw.
      if ((value != null) && !value.isEmpty()) {
        bld.append("\"").append(value).append("\"");
      }
      String prefix = "";
      for (Map.Entry<String, LinkedList<Stanza>> entry :
          subtrees.entrySet()) {
        final String key = entry.getKey();
        for (Stanza child : entry.getValue()) {
          bld.append(prefix);
          bld.append("<").append(key).append(">");
          bld.append(child.toString());
          prefix = ", ";
        }
      }
      bld.append("}");
      return bld.toString();
    }
  }
}