Package pyxmpp :: Module xmlextra
[hide private]

Source Code for Module pyxmpp.xmlextra

  1  # 
  2  # (C) Copyright 2003-2010 Jacek Konieczny <jajcus@jajcus.net> 
  3  # 
  4  # This program is free software; you can redistribute it and/or modify 
  5  # it under the terms of the GNU Lesser General Public License Version 
  6  # 2.1 as published by the Free Software Foundation. 
  7  # 
  8  # This program is distributed in the hope that it will be useful, 
  9  # but WITHOUT ANY WARRANTY; without even the implied warranty of 
 10  # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
 11  # GNU Lesser General Public License for more details. 
 12  # 
 13  # You should have received a copy of the GNU Lesser General Public 
 14  # License along with this program; if not, write to the Free Software 
 15  # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 
 16  # 
 17  # pylint: disable-msg=C0103, W0132, W0611 
 18   
 19  """Extension to libxml2 for XMPP stream and stanza processing""" 
 20   
 21  __docformat__="restructuredtext en" 
 22   
 23  import sys 
 24  import libxml2 
 25  import threading 
 26  import re 
 27  import logging 
 28   
 29  from pyxmpp.exceptions import StreamParseError 
 30   
# Module-level libxml2 document holding a single <root/> element.  The
# root gets a namespace declaration for COMMON_NS created with a `None`
# prefix and is then assigned to that namespace.
# NOTE(review): presumably used by other modules as a shared parent document
# for detached nodes -- nothing in this file uses it; confirm against callers.
common_doc = libxml2.newDoc("1.0")
common_root = common_doc.newChild(None,"root",None)
COMMON_NS = "http://pyxmpp.jajcus.net/xmlns/common"
common_ns = common_root.newNs(COMMON_NS, None)
common_root.setNs(common_ns)
common_doc.setRootElement(common_root)

# Logger used by `StreamHandler.warning` to report XML stream warnings.
logger = logging.getLogger("pyxmpp.xmlextra")
 39   
class StreamHandler:
    """Base class for stream handler.

    The underscore-prefixed methods wrap the objects they receive in
    `libxml2.xmlDoc` / `libxml2.xmlNode` and forward them to the public
    method of the same name.  The default `stream_start`, `stream_end` and
    `stanza` implementations only report the event on standard error;
    `error` raises `StreamParseError` and `warning` logs the message.
    """

    def __init__(self):
        pass

    def _stream_start(self, _doc):
        """Process stream start.

        :Parameters:
            - `_doc`: object accepted by the `libxml2.xmlDoc` constructor.
        """
        doc = libxml2.xmlDoc(_doc)
        self.stream_start(doc)

    def _stream_end(self, _doc):
        """Process stream end.

        :Parameters:
            - `_doc`: object accepted by the `libxml2.xmlDoc` constructor.
        """
        doc = libxml2.xmlDoc(_doc)
        self.stream_end(doc)

    def _stanza(self, _doc, _node):
        """Process complete stanza.

        :Parameters:
            - `_doc`: object accepted by the `libxml2.xmlDoc` constructor.
            - `_node`: object accepted by the `libxml2.xmlNode` constructor.
        """
        doc = libxml2.xmlDoc(_doc)
        node = libxml2.xmlNode(_node)
        self.stanza(doc, node)

    def stream_start(self, doc):
        """Called when the start tag of root element is encountered
        in the stream.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        # repr() replaces the deprecated backtick syntax; the text written is
        # the same as the former ``print >>sys.stderr`` statement produced.
        sys.stderr.write(
            "Unhandled stream start: %s\n" % (repr(doc.serialize()),))

    def stream_end(self, doc):
        """Called when the end tag of root element is encountered
        in the stream.

        :Parameters:
            - `doc`: the document being parsed.
        :Types:
            - `doc`: `libxml2.xmlDoc`"""
        sys.stderr.write(
            "Unhandled stream end %s\n" % (repr(doc.serialize()),))

    def stanza(self, _unused, node):
        """Called when the end tag of a direct child of the root
        element is encountered in the stream.

        Please note, that node will be removed from the document
        and freed after this method returns. If it is needed after
        that a copy must be made before the method returns.

        :Parameters:
            - `_unused`: the document being parsed.
            - `node`: the (complete) element being processed
        :Types:
            - `_unused`: `libxml2.xmlDoc`
            - `node`: `libxml2.xmlNode`"""
        sys.stderr.write(
            "Unhandled stanza %s\n" % (repr(node.serialize()),))

    def error(self, descr):
        """Called when an error is encountered in the stream.

        :Parameters:
            - `descr`: description of the error
        :Types:
            - `descr`: `str`

        :raise StreamParseError: always, carrying `descr`."""
        raise StreamParseError(descr)

    def warning(self, desc):
        """Called when a warning is encountered in the stream.

        Warnings starting with one of two known prefixes are dropped
        silently; every other warning is logged via the module logger.

        :Parameters:
            - `desc`: description of the warning
        :Types:
            - `desc`: `str`"""
        ignored_prefixes = (
            # we know vcard-temp is bad...
            'xmlns: URI vcard-temp is not absolute',
            # this is also bad...
            'xmlns: http://www.xmpp.org/extensions/xep-0084.html#',
        )
        if desc.startswith(ignored_prefixes):
            return
        # Deferred %-formatting: a byte-string template combined with
        # str.format() fails on non-ASCII unicode input under Python 2.
        logger.warning("XML STREAM WARNING: %s", desc)
120 121 try: 122 ######################################################################### 123 # C-extension based workarounds for libxml2 limitations 124 #------------------------------------------------------- 125 from pyxmpp import _xmlextra 126 from pyxmpp._xmlextra import error 127 128 _create_reader = _xmlextra.sax_reader_new 129
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (C-extension backed).

    Each argument is reduced to its `_o` attribute (or `None`) and handed
    to `_xmlextra.replace_ns`.  When an old namespace was supplied,
    `_xmlextra.remove_ns` is then called for it on `node`, so the old
    namespace declaration will be removed if present on the `node`.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""
    raw_old = None if old_ns is None else old_ns._o
    raw_new = None if new_ns is None else new_ns._o
    raw_node = None if node is None else node._o
    _xmlextra.replace_ns(raw_node, raw_old, raw_new)
    if raw_old:
        _xmlextra.remove_ns(raw_node, raw_old)
160 161 pure_python = False 162 163 except ImportError: 164 ######################################################################### 165 # Pure python implementation (slow workarounds for libxml2 limitations) 166 #-----------------------------------------------------------------------
class error(Exception):
    """Stream parse error.

    Defined only when the `_xmlextra` C extension cannot be imported, as a
    stand-in for the `error` name that extension would otherwise provide.
    """
170
def _escape(data):
    """Return `data` with the five XML special characters replaced by
    their predefined entity references."""
    # "&" goes first so that the ampersands introduced by the remaining
    # replacements are not escaped a second time.
    substitutions = (
        ("&", "&amp;"),
        ("<", "&lt;"),
        (">", "&gt;"),
        ("'", "&apos;"),
        ('"', "&quot;"),
    )
    for character, entity in substitutions:
        data = data.replace(character, entity)
    return data
179
class _SAXCallback(libxml2.SAXCallback):
    """SAX events handler for the python-only stream parser.

    The handler tracks the element nesting depth in `_level`.  The root
    element's start tag and a matching end tag are kept in `_head` and
    `_tail`; the markup of the direct child of the root currently being
    read is accumulated as text in `_current`.  When such a child is
    closed, `_head + _current + _tail` is parsed as a separate document and
    the resulting element is copied into `_doc` and passed to the stream
    handler.
    """

    def __init__(self, handler):
        """Initialize the SAX handler.

        :Parameters:
            - `handler`: Object to handle stream start, end and stanzas.
        :Types:
            - `handler`: `StreamHandler`
        """
        self._handler = handler
        self._head = ""
        self._tail = ""
        self._current = ""
        self._level = 0
        self._doc = None
        self._root = None

    def cdataBlock(self, data):
        """Append escaped CDATA content to the element being collected."""
        if self._level > 1:
            self._current += _escape(data)

    def characters(self, data):
        """Append escaped character data to the element being collected."""
        if self._level > 1:
            self._current += _escape(data)

    def comment(self, content):
        """Ignore comments."""
        pass

    def endDocument(self):
        """Nothing to do at the end of the document."""
        pass

    def endElement(self, tag):
        """Handle an end tag: finish a stanza or the whole stream."""
        self._current += "</%s>" % (tag,)
        self._level -= 1
        if self._level > 1:
            # Still inside a stanza; keep collecting.
            return
        if self._level == 1:
            # A direct child of the root is complete: re-parse it inside a
            # copy of the root element so namespace declarations made on
            # the root apply to it.
            xml = self._head + self._current + self._tail
            doc = libxml2.parseDoc(xml)
            try:
                node = doc.getRootElement().children
                try:
                    node1 = node.docCopyNode(self._doc, 1)
                    try:
                        self._root.addChild(node1)
                        self._handler.stanza(self._doc, node1)
                    except Exception:
                        node1.unlinkNode()
                        node1.freeNode()
                        del node1
                        # Bare raise keeps the original traceback.
                        raise
                finally:
                    del node
            finally:
                doc.freeDoc()
        else:
            # The root element was closed.  The document is released only
            # when the handler returns normally.
            self._handler.stream_end(self._doc)
            self._doc.freeDoc()
            self._doc = None
            self._root = None

    def error(self, msg):
        """Forward a parser error to the stream handler."""
        self._handler.error(msg)

    fatalError = error

    ignorableWhitespace = characters

    def reference(self, name):
        """Append an entity reference to the element being collected."""
        self._current += "&" + name + ";"

    def startDocument(self):
        """Nothing to do at the start of the document."""
        pass

    def startElement(self, tag, attrs):
        """Handle a start tag: open the stream, a stanza or a nested element.

        :Parameters:
            - `tag`: name of the element.
            - `attrs`: attributes of the element; may be empty or `None`.
        """
        s = "<" + tag
        if attrs:
            for a, v in attrs.items():
                s += " %s='%s'" % (a, _escape(v))
        s += ">"
        if self._level == 0:
            # Root element: build the stream document from its start tag
            # and a synthesized end tag.
            self._head = s
            self._tail = "</%s>" % (tag,)
            xml = self._head + self._tail
            self._doc = libxml2.parseDoc(xml)
            self._handler.stream_start(self._doc)
            self._root = self._doc.getRootElement()
        elif self._level == 1:
            # Direct child of the root: start collecting a new stanza.
            self._current = s
        else:
            self._current += s
        self._level += 1

    def warning(self, msg):
        """Forward a parser warning to the stream handler.

        :Parameters:
            - `msg`: the warning message reported by the parser.
        """
        # The callback has to accept the message; the stream handler
        # decides whether the warning is worth logging.
        self._handler.warning(msg)
291
class _PythonReader:
    """Python-only stream reader.

    Drives a libxml2 push parser whose SAX events are handled by
    `_SAXCallback`.
    """

    def __init__(self, handler):
        """Initialize the reader.

        :Parameters:
            - `handler`: Object to handle stream start, end and stanzas.
        :Types:
            - `handler`: `StreamHandler`
        """
        self.handler = handler
        self.sax = _SAXCallback(handler)
        self.parser = libxml2.createPushParser(self.sax, '', 0, 'stream')

    def doc(self):
        """Get the document being parsed.

        `StreamReader.doc` calls this on its reader and wraps a non-empty
        result in `libxml2.xmlDoc`, so the underlying `_o` object is
        returned rather than the wrapper held by the SAX handler.

        :return: the `_o` object of the SAX handler's current document or
            `None` when there is no document."""
        current = self.sax._doc
        if current is None:
            return None
        return current._o

    def feed(self, data):
        """Feed the parser with a chunk of data. Appropriate methods
        of `self.handler` will be called whenever something interesting is
        found.

        :Parameters:
            - `data`: the chunk of data to parse.
        :Types:
            - `data`: `str`

        :return: whatever the push parser's `parseChunk` returns."""
        return self.parser.parseChunk(data, len(data), 0)

# Factory used by `StreamReader` to create its reader.
_create_reader = _PythonReader
def _get_ns(node):
    """Return the namespace assigned to `node`.

    :return: whatever `node.ns()` returns, or `None` when that call raises
        `libxml2.treeError`.
    :returntype: `libxml2.xmlNs`"""
    try:
        namespace = node.ns()
    except libxml2.treeError:
        namespace = None
    return namespace
329
def replace_ns(node, old_ns, new_ns):
    """Replace namespaces in a whole subtree (pure Python version).

    The namespace of `node` and of each of its attributes is switched to
    `new_ns` when it matches `old_ns` by both URI and prefix (or when both
    are absent).  Child elements are processed recursively, except those
    that declare a namespace with the same prefix as `old_ns` themselves.

    :Parameters:
        - `node`: the root of the subtree where namespaces should be replaced.
        - `old_ns`: the namespace to replace.
        - `new_ns`: the namespace to be used instead of old_ns.
    :Types:
        - `node`: `libxml2.xmlNode`
        - `old_ns`: `libxml2.xmlNs`
        - `new_ns`: `libxml2.xmlNs`

    Both old_ns and new_ns may be None meaning no namespace set."""
    if old_ns is None:
        old_uri = None
        old_prefix = None
    else:
        old_uri = old_ns.content
        old_prefix = old_ns.name

    def is_old(candidate):
        # "No namespace" only matches when no old namespace was given.
        if candidate is None:
            return old_ns is None
        return bool(
            candidate
            and candidate.content == old_uri
            and candidate.name == old_prefix
        )

    if is_old(_get_ns(node)):
        node.setNs(new_ns)

    attribute = node.properties
    while attribute:
        if is_old(_get_ns(attribute)):
            attribute.setNs(new_ns)
        attribute = attribute.next

    child = node.children
    while child:
        if child.type == 'element':
            try:
                declaration = child.nsDefs()
            except libxml2.treeError:
                declaration = None
            while declaration:
                if declaration.name == old_prefix:
                    # The prefix is redeclared here; leave this subtree alone.
                    break
                declaration = declaration.next
            else:
                replace_ns(child, old_ns, new_ns)
        child = child.next

pure_python = True

###########################################################
# Common code
#-------------
def get_node_ns(xmlnode):
    """Return the namespace object of an XML node.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: the result of `xmlnode.ns()`, or `None` when that call raises
        `libxml2.treeError`.
    :returntype: `libxml2.xmlNs`"""
    try:
        namespace = xmlnode.ns()
    except libxml2.treeError:
        namespace = None
    return namespace
403
def get_node_ns_uri(xmlnode):
    """Return the namespace URI of an XML node as a unicode string.

    :Parameters:
        - `xmlnode`: the XML node to query.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: the namespace content decoded from UTF-8, or `None` when
        `get_node_ns` yields nothing for the node.
    :returntype: `unicode`"""
    namespace = get_node_ns(xmlnode)
    if not namespace:
        return None
    return unicode(namespace.getContent(), "utf-8")
419
def xml_node_iter(nodelist):
    """Iterate over sibling XML nodes, following their `next` links.
    Nodes of every type are yielded, not only the elements.

    Usually used to iterate over node's children like this::

        xml_node_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while True:
        if not current:
            return
        yield current
        current = current.next
437
def xml_element_iter(nodelist):
    """Iterate over sibling XML elements, following their `next` links.
    Nodes whose `type` is not "element" are skipped.

    Usually used to iterate over node's children like this::

        xml_element_iter(node.children)

    :Parameters:
        - `nodelist`: start node of the list.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
    """
    current = nodelist
    while True:
        if not current:
            return
        if current.type == "element":
            yield current
        current = current.next
455
def xml_element_ns_iter(nodelist, ns_uri):
    """Iterate over sibling XML elements, following their `next` links.
    Only elements whose namespace URI (as reported by `get_node_ns_uri`)
    equals `ns_uri` are yielded.

    Usually used to iterate over node's children like this::

        xml_element_ns_iter(node.children, ns_uri)

    :Parameters:
        - `nodelist`: start node of the list.
        - `ns_uri`: namespace URI the yielded elements must have.
    :Types:
        - `nodelist`: `libxml2.xmlNode`
        - `ns_uri`: `unicode`
    """
    current = nodelist
    while True:
        if not current:
            return
        is_element = current.type == "element"
        if is_element and get_node_ns_uri(current) == ns_uri:
            yield current
        current = current.next
# Control characters other than TAB (\011), LF (\012) and CR (\015).
evil_characters_re = re.compile(r"[\000-\010\013\014\016-\037]", re.UNICODE)
# U+FFFD REPLACEMENT CHARACTER encoded as UTF-8, for byte-string input.
utf8_replacement_char = u"\ufffd".encode("utf-8")

def remove_evil_characters(s):
    """Replace control characters (not allowed in XML) in a string.

    Every character matched by `evil_characters_re` is replaced by
    U+FFFD: as a unicode character for unicode input, as its UTF-8
    encoding for any other input."""
    # type(u"") is the unicode string type.
    if isinstance(s, type(u"")):
        replacement = u"\ufffd"
    else:
        replacement = utf8_replacement_char
    return evil_characters_re.sub(replacement, s)
# Matches the first default namespace declaration (xmlns="..." or
# xmlns='...') inside the first start tag; group 1 is everything before it.
bad_nsdef_replace_re = re.compile(r"^([^<]*\<[^><]*\s+)(xmlns=((\"[^\"]*\")|(\'[^\']*\')))")

def safe_serialize(xmlnode):
    """Serialize an XML element making sure the result is sane.

    The element is serialized as UTF-8.  If it carries a default (prefix-less)
    namespace declaration that differs from the element's own namespace,
    the first such declaration is cut out of the serialized start tag.
    Finally the result is passed through `remove_evil_characters`.

    :Parameters:
        - `xmlnode`: the XML element to serialize.
    :Types:
        - `xmlnode`: `libxml2.xmlNode`

    :return: UTF-8 encoded serialized and sanitized element.
    :returntype: `string`"""
    try:
        own_ns = xmlnode.ns()
    except libxml2.treeError:
        own_ns = None
    try:
        declaration = xmlnode.nsDefs()
    except libxml2.treeError:
        declaration = None
    serialized = xmlnode.serialize(encoding="UTF-8")
    while declaration:
        if declaration.name is not None:
            # Prefixed declarations are never touched.
            declaration = declaration.next
            continue
        declared = (declaration.name, declaration.content)
        if not own_ns or declared != (own_ns.name, own_ns.content):
            serialized = bad_nsdef_replace_re.sub("\\1", serialized, 1)
            break
        declaration = declaration.next
    return remove_evil_characters(serialized)
515
class StreamReader:
    """A simple push-parser interface for XML streams.

    Wraps the reader created by `_create_reader` and guards `feed` against
    re-entrant calls with a lock and an `in_use` flag.
    """

    def __init__(self, handler):
        """Initialize `StreamReader` object.

        :Parameters:
            - `handler`: handler object for the stream content
        :Types:
            - `handler`: `StreamHandler` derived class
        """
        self.reader = _create_reader(handler)
        self.lock = threading.RLock()
        self.in_use = 0

    def doc(self):
        """Get the document being parsed.

        :return: the reader's document wrapped in `libxml2.xmlDoc`, or
            `None` when the reader reports no document.
        :returntype: `libxml2.xmlDoc`"""
        raw_doc = self.reader.doc()
        if not raw_doc:
            return None
        return libxml2.xmlDoc(raw_doc)

    def feed(self, s):
        """Pass a string to the stream parser.

        Parameters:
            - `s`: string to parse.
        Types:
            - `s`: `str`

        :return: `None` on EOF, `False` when whole input was parsed and `True`
            if there is something still left in the buffer.

        :raise StreamParseError: when called while another `feed` call on
            this object is still in progress."""
        with self.lock:
            # The lock is re-entrant, so a nested call from the same thread
            # gets this far; the flag is what detects it.
            if self.in_use:
                raise StreamParseError("StreamReader.feed() is not reentrant!")
            self.in_use = 1
            try:
                return self.reader.feed(s)
            finally:
                self.in_use = 0
559 560 561 # vi: sts=4 et sw=4 562