001/** 002 * Licensed to the Apache Software Foundation (ASF) under one 003 * or more contributor license agreements. See the NOTICE file 004 * distributed with this work for additional information 005 * regarding copyright ownership. The ASF licenses this file 006 * to you under the Apache License, Version 2.0 (the 007 * "License"); you may not use this file except in compliance 008 * with the License. You may obtain a copy of the License at 009 * 010 * http://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018 019package org.apache.hadoop.hdfs.util; 020 021import org.apache.hadoop.classification.InterfaceAudience; 022import org.apache.hadoop.classification.InterfaceStability; 023import org.xml.sax.ContentHandler; 024import org.xml.sax.SAXException; 025import org.xml.sax.helpers.AttributesImpl; 026 027import java.util.LinkedList; 028import java.util.List; 029import java.util.Map; 030import java.util.TreeMap; 031 032/** 033 * General xml utilities. 034 * 035 */ 036@InterfaceAudience.Private 037@InterfaceStability.Unstable 038public class XMLUtils { 039 /** 040 * Exception that reflects an invalid XML document. 041 */ 042 static public class InvalidXmlException extends RuntimeException { 043 private static final long serialVersionUID = 1L; 044 public InvalidXmlException(String s) { 045 super(s); 046 } 047 } 048 049 /** 050 * Exception that reflects a string that cannot be unmangled. 051 */ 052 public static class UnmanglingError extends RuntimeException { 053 private static final long serialVersionUID = 1L; 054 055 public UnmanglingError(String str, Exception e) { 056 super(str, e); 057 } 058 059 public UnmanglingError(String str) { 060 super(str); 061 } 062 } 063 064 065 /** 066 * Given a code point, determine if it should be mangled before being 067 * represented in an XML document. 068 * 069 * Any code point that isn't valid in XML must be mangled. 070 * See http://en.wikipedia.org/wiki/Valid_characters_in_XML for a 071 * quick reference, or the w3 standard for the authoritative reference. 072 * 073 * @param cp The code point 074 * @return True if the code point should be mangled 075 */ 076 private static boolean codePointMustBeMangled(int cp) { 077 if (cp < 0x20) { 078 return ((cp != 0x9) && (cp != 0xa) && (cp != 0xd)); 079 } else if ((0xd7ff < cp) && (cp < 0xe000)) { 080 return true; 081 } else if ((cp == 0xfffe) || (cp == 0xffff)) { 082 return true; 083 } else if (cp == 0x5c) { 084 // we mangle backslash to simplify decoding... it's 085 // easier if backslashes always begin mangled sequences. 086 return true; 087 } 088 return false; 089 } 090 091 private static final int NUM_SLASH_POSITIONS = 4; 092 093 private static String mangleCodePoint(int cp) { 094 return String.format("\\%0" + NUM_SLASH_POSITIONS + "x;", cp); 095 } 096 097 private static String codePointToEntityRef(int cp) { 098 switch (cp) { 099 case '&': 100 return "&"; 101 case '\"': 102 return """; 103 case '\'': 104 return "'"; 105 case '<': 106 return "<"; 107 case '>': 108 return ">"; 109 default: 110 return null; 111 } 112 } 113 114 /** 115 * Mangle a string so that it can be represented in an XML document. 116 * 117 * There are three kinds of code points in XML: 118 * - Those that can be represented normally, 119 * - Those that have to be escaped (for example, & must be represented 120 * as &) 121 * - Those that cannot be represented at all in XML. 122 * 123 * The built-in SAX functions will handle the first two types for us just 124 * fine. However, sometimes we come across a code point of the third type. 125 * In this case, we have to mangle the string in order to represent it at 126 * all. We also mangle backslash to avoid confusing a backslash in the 127 * string with part our escape sequence. 128 * 129 * The encoding used here is as follows: an illegal code point is 130 * represented as '\ABCD;', where ABCD is the hexadecimal value of 131 * the code point. 132 * 133 * @param str The input string. 134 * 135 * @return The mangled string. 136 */ 137 public static String mangleXmlString(String str, boolean createEntityRefs) { 138 final StringBuilder bld = new StringBuilder(); 139 final int length = str.length(); 140 for (int offset = 0; offset < length; ) { 141 final int cp = str.codePointAt(offset); 142 final int len = Character.charCount(cp); 143 if (codePointMustBeMangled(cp)) { 144 bld.append(mangleCodePoint(cp)); 145 } else { 146 String entityRef = null; 147 if (createEntityRefs) { 148 entityRef = codePointToEntityRef(cp); 149 } 150 if (entityRef != null) { 151 bld.append(entityRef); 152 } else { 153 for (int i = 0; i < len; i++) { 154 bld.append(str.charAt(offset + i)); 155 } 156 } 157 } 158 offset += len; 159 } 160 return bld.toString(); 161 } 162 163 /** 164 * Demangle a string from an XML document. 165 * See {@link #mangleXmlString(String, boolean)} for a description of the 166 * mangling format. 167 * 168 * @param str The string to be demangled. 169 * 170 * @return The unmangled string 171 * @throws UnmanglingError if the input is malformed. 172 */ 173 public static String unmangleXmlString(String str, boolean decodeEntityRefs) 174 throws UnmanglingError { 175 int slashPosition = -1; 176 String escapedCp = ""; 177 StringBuilder bld = new StringBuilder(); 178 StringBuilder entityRef = null; 179 for (int i = 0; i < str.length(); i++) { 180 char ch = str.charAt(i); 181 if (entityRef != null) { 182 entityRef.append(ch); 183 if (ch == ';') { 184 String e = entityRef.toString(); 185 if (e.equals(""")) { 186 bld.append("\""); 187 } else if (e.equals("'")) { 188 bld.append("\'"); 189 } else if (e.equals("&")) { 190 bld.append("&"); 191 } else if (e.equals("<")) { 192 bld.append("<"); 193 } else if (e.equals(">")) { 194 bld.append(">"); 195 } else { 196 throw new UnmanglingError("Unknown entity ref " + e); 197 } 198 entityRef = null; 199 } 200 } else if ((slashPosition >= 0) && (slashPosition < NUM_SLASH_POSITIONS)) { 201 escapedCp += ch; 202 ++slashPosition; 203 } else if (slashPosition == NUM_SLASH_POSITIONS) { 204 if (ch != ';') { 205 throw new UnmanglingError("unterminated code point escape: " + 206 "expected semicolon at end."); 207 } 208 try { 209 bld.appendCodePoint(Integer.parseInt(escapedCp, 16)); 210 } catch (NumberFormatException e) { 211 throw new UnmanglingError("error parsing unmangling escape code", e); 212 } 213 escapedCp = ""; 214 slashPosition = -1; 215 } else if (ch == '\\') { 216 slashPosition = 0; 217 } else { 218 boolean startingEntityRef = false; 219 if (decodeEntityRefs) { 220 startingEntityRef = (ch == '&'); 221 } 222 if (startingEntityRef) { 223 entityRef = new StringBuilder(); 224 entityRef.append("&"); 225 } else { 226 bld.append(ch); 227 } 228 } 229 } 230 if (entityRef != null) { 231 throw new UnmanglingError("unterminated entity ref starting with " + 232 entityRef.toString()); 233 } else if (slashPosition != -1) { 234 throw new UnmanglingError("unterminated code point escape: string " + 235 "broke off in the middle"); 236 } 237 return bld.toString(); 238 } 239 240 /** 241 * Add a SAX tag with a string inside. 242 * 243 * @param contentHandler the SAX content handler 244 * @param tag the element tag to use 245 * @param val the string to put inside the tag 246 */ 247 public static void addSaxString(ContentHandler contentHandler, 248 String tag, String val) throws SAXException { 249 contentHandler.startElement("", "", tag, new AttributesImpl()); 250 char c[] = mangleXmlString(val, false).toCharArray(); 251 contentHandler.characters(c, 0, c.length); 252 contentHandler.endElement("", "", tag); 253 } 254 255 /** 256 * Represents a bag of key-value pairs encountered during parsing an XML 257 * file. 258 */ 259 static public class Stanza { 260 private final TreeMap<String, LinkedList <Stanza > > subtrees; 261 262 /** The unmangled value of this stanza. */ 263 private String value; 264 265 public Stanza() { 266 subtrees = new TreeMap<String, LinkedList <Stanza > >(); 267 value = ""; 268 } 269 270 public void setValue(String value) { 271 this.value = value; 272 } 273 274 public String getValue() { 275 return this.value; 276 } 277 278 /** 279 * Discover if a stanza has a given entry. 280 * 281 * @param name entry to look for 282 * 283 * @return true if the entry was found 284 */ 285 public boolean hasChildren(String name) { 286 return subtrees.containsKey(name); 287 } 288 289 /** 290 * Pull an entry from a stanza. 291 * 292 * @param name entry to look for 293 * 294 * @return the entry 295 */ 296 public List<Stanza> getChildren(String name) throws InvalidXmlException { 297 LinkedList <Stanza> children = subtrees.get(name); 298 if (children == null) { 299 throw new InvalidXmlException("no entry found for " + name); 300 } 301 return children; 302 } 303 304 /** 305 * Pull a string entry from a stanza. 306 * 307 * @param name entry to look for 308 * 309 * @return the entry 310 */ 311 public String getValue(String name) throws InvalidXmlException { 312 String ret = getValueOrNull(name); 313 if (ret == null) { 314 throw new InvalidXmlException("no entry found for " + name); 315 } 316 return ret; 317 } 318 319 /** 320 * Pull a string entry from a stanza, or null. 321 * 322 * @param name entry to look for 323 * 324 * @return the entry, or null if it was not found. 325 */ 326 public String getValueOrNull(String name) throws InvalidXmlException { 327 if (!subtrees.containsKey(name)) { 328 return null; 329 } 330 LinkedList <Stanza> l = subtrees.get(name); 331 if (l.size() != 1) { 332 throw new InvalidXmlException("More than one value found for " + name); 333 } 334 return l.get(0).getValue(); 335 } 336 337 /** 338 * Add an entry to a stanza. 339 * 340 * @param name name of the entry to add 341 * @param child the entry to add 342 */ 343 public void addChild(String name, Stanza child) { 344 LinkedList<Stanza> l; 345 if (subtrees.containsKey(name)) { 346 l = subtrees.get(name); 347 } else { 348 l = new LinkedList<Stanza>(); 349 subtrees.put(name, l); 350 } 351 l.add(child); 352 } 353 354 /** 355 * Convert a stanza to a human-readable string. 356 */ 357 @Override 358 public String toString() { 359 StringBuilder bld = new StringBuilder(); 360 bld.append("{"); 361 if (!value.equals("")) { 362 bld.append("\"").append(value).append("\""); 363 } 364 String prefix = ""; 365 for (Map.Entry<String, LinkedList <Stanza > > entry : 366 subtrees.entrySet()) { 367 String key = entry.getKey(); 368 LinkedList <Stanza > ll = entry.getValue(); 369 for (Stanza child : ll) { 370 bld.append(prefix); 371 bld.append("<").append(key).append(">"); 372 bld.append(child.toString()); 373 prefix = ", "; 374 } 375 } 376 bld.append("}"); 377 return bld.toString(); 378 } 379 } 380}