1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 """Extension to libxml2 for XMPP stream and stanza processing"""
20
21 __revision__="$Id: xmlextra.py,v 1.15 2004/10/11 18:33:51 jajcus Exp $"
22 __docformat__="restructuredtext en"
23
24 import sys
25 import libxml2
26 import threading
27 import re
28
29 from pyxmpp.exceptions import StreamParseError
30
31 common_doc = libxml2.newDoc("1.0")
32 common_root = common_doc.newChild(None,"root",None)
33 COMMON_NS = "http://pyxmpp.jajcus.net/xmlns/common"
34 common_ns = common_root.newNs(COMMON_NS, None)
35 common_root.setNs(common_ns)
36 common_doc.setRootElement(common_root)
37
39 """Base class for stream handler."""
42
47
49 """Process stream end."""
50 doc=libxml2.xmlDoc(_doc)
51 self.stream_end(doc)
52
54 """Process complete stanza."""
55 doc=libxml2.xmlDoc(_doc)
56 node=libxml2.xmlNode(_node)
57 self.stanza(doc,node)
58
60 """Called when the start tag of root element is encountered
61 in the stream.
62
63 :Parameters:
64 - `doc`: the document being parsed.
65 :Types:
66 - `doc`: `libxml2.xmlDoc`"""
67 print >>sys.stderr,"Unhandled stream start:",`doc.serialize()`
68
70 """Called when the end tag of root element is encountered
71 in the stream.
72
73 :Parameters:
74 - `doc`: the document being parsed.
75 :Types:
76 - `doc`: `libxml2.xmlDoc`"""
77 print >>sys.stderr,"Unhandled stream end",`doc.serialize()`
78
def stanza(self, _unused, node):
    """Called when the end tag of a direct child of the root
    element is encountered in the stream.

    Please note, that node will be removed from the document
    and freed after this method returns. If it is needed after
    that a copy must be made before the method returns.

    This default implementation only reports the unhandled stanza on
    standard error.

    :Parameters:
        - `_unused`: the document being parsed.
        - `node`: the (complete) element being processed
    :Types:
        - `_unused`: `libxml2.xmlDoc`
        - `node`: `libxml2.xmlNode`"""
    # A single write() with %r produces the same text as the former
    # print-chevron statement with a backtick repr, but uses syntax that
    # is accepted by Python 3 as well as Python 2.
    sys.stderr.write("Unhandled stanza %r\n" % (node.serialize(),))
94
96 """Called when an error is encountered in the stream.
97
98 :Parameters:
99 - `descr`: description of the error
100 :Types:
101 - `descr`: `str`"""
102 raise StreamParseError,descr
103
104 try:
105
106
107
108 from pyxmpp import _xmlextra
109 from pyxmpp._xmlextra import error
110
111 _create_reader = _xmlextra.sax_reader_new
112
114 """Replace namespaces in a whole subtree.
115
116 The old namespace declaration will be removed if present on the `node`.
117
118 :Parameters:
119 - `node`: the root of the subtree where namespaces should be replaced.
120 - `old_ns`: the namespace to replace.
121 - `new_ns`: the namespace to be used instead of old_ns.
122 :Types:
123 - `node`: `libxml2.xmlNode`
124 - `old_ns`: `libxml2.xmlNs`
125 - `new_ns`: `libxml2.xmlNs`
126
127 Both old_ns and new_ns may be None meaning no namespace set."""
128 if old_ns is None:
129 old_ns__o = None
130 else:
131 old_ns__o = old_ns._o
132 if new_ns is None:
133 new_ns__o = None
134 else:
135 new_ns__o = new_ns._o
136 if node is None:
137 node__o = None
138 else:
139 node__o = node._o
140 _xmlextra.replace_ns(node__o, old_ns__o, new_ns__o)
141 if old_ns__o:
142 _xmlextra.remove_ns(node__o, old_ns__o)
143
144 pure_python = False
145
146 except ImportError:
147
148
149
151 """Exception raised on a stream parse error."""
152 pass
153
def _escape(data):
    """Escape data for XML.

    Each of the five characters that have a special meaning in XML markup
    is replaced by its predefined entity reference, so the result can be
    placed in element content or in an attribute value quoted with either
    quote character.

    :Parameters:
        - `data`: the text to escape.

    :return: `data` with ``&``, ``<``, ``>``, ``'`` and ``"`` replaced by
        entity references."""
    # The ampersand has to be handled first: the other replacements
    # introduce ampersands of their own, which must not be escaped again.
    data = data.replace("&", "&amp;")
    data = data.replace("<", "&lt;")
    data = data.replace(">", "&gt;")
    data = data.replace("'", "&apos;")
    data = data.replace('"', "&quot;")
    return data
162
164 """SAX events handler for the python-only stream parser."""
166 """Initialize the SAX handler.
167
168 :Parameters:
169 - `handler`: Object to handle stream start, end and stanzas.
170 :Types:
171 - `handler`: `StreamHandler`
172 """
173 self._handler = handler
174 self._head = ""
175 self._tail = ""
176 self._current = ""
177 self._level = 0
178 self._doc = None
179 self._root = None
180
182 ""
183 if self._level>1:
184 self._current += _escape(data)
185
187 ""
188 if self._level>1:
189 self._current += _escape(data)
190
194
198
200 ""
201 self._current+="</%s>" % (tag,)
202 self._level -= 1
203 if self._level > 1:
204 return
205 if self._level==1:
206 xml=self._head+self._current+self._tail
207 doc=libxml2.parseDoc(xml)
208 try:
209 node = doc.getRootElement().children
210 try:
211 node1 = node.docCopyNode(self._doc, 1)
212 try:
213 self._root.addChild(node1)
214 self._handler.stanza(self._doc, node1)
215 except:
216 node1.unlinkNode()
217 node1.freeNode()
218 del node1
219 finally:
220 del node
221 finally:
222 doc.freeDoc()
223 else:
224 xml=self._head+self._tail
225 doc=libxml2.parseDoc(xml)
226 try:
227 self._handler.stream_end(self._doc)
228 self._doc.freeDoc()
229 self._doc = None
230 self._root = None
231 finally:
232 doc.freeDoc()
233
235 ""
236 self._handler.error(msg)
237
238 fatalError = error
239
240 ignorableWhitespace = characters
241
243 ""
244 self._current += "&" + name + ";"
245
249
251 ""
252 s = "<"+tag
253 if attrs:
254 for a,v in attrs.items():
255 s+=" %s='%s'" % (a,_escape(v))
256 s += ">"
257 if self._level == 0:
258 self._head = s
259 self._tail = "</%s>" % (tag,)
260 xml=self._head+self._tail
261 self._doc = libxml2.parseDoc(xml)
262 self._handler.stream_start(self._doc)
263 self._root = self._doc.getRootElement()
264 elif self._level == 1:
265 self._current = s
266 else:
267 self._current += s
268 self._level += 1
269
273
275 """Python-only stream reader."""
277 """Initialize the reader.
278
279 :Parameters:
280 - `handler`: Object to handle stream start, end and stanzas.
281 :Types:
282 - `handler`: `StreamHandler`
283 """
284 self.handler = handler
285 self.sax = _SAXCallback(handler)
286 self.parser = libxml2.createPushParser(self.sax, '', 0, 'stream')
287
def feed(self, data):
    """Push one chunk of input into the underlying push parser.

    While the chunk is parsed, appropriate methods of `self.handler`
    will be called whenever something interesting is found.

    :Parameters:
        - `data`: the chunk of data to parse.
    :Types:
        - `data`: `str`

    :return: the value returned by the parser's ``parseChunk`` call."""
    push_chunk = self.parser.parseChunk
    # The last argument is libxml2's "terminate" flag; it is always 0 here,
    # so the parser stays open for further chunks.
    return push_chunk(data, len(data), 0)
298
299 _create_reader = _PythonReader
300
302 """Get namespace of node.
303
304 :return: the namespace object or `None` if the node has no namespace
305 assigned.
306 :returntype: `libxml2.xmlNs`"""
307 try:
308 return node.ns()
309 except libxml2.treeError:
310 return None
311
313 """Replace namespaces in a whole subtree.
314
315 :Parameters:
316 - `node`: the root of the subtree where namespaces should be replaced.
317 - `old_ns`: the namespace to replace.
318 - `new_ns`: the namespace to be used instead of old_ns.
319 :Types:
320 - `node`: `libxml2.xmlNode`
321 - `old_ns`: `libxml2.xmlNs`
322 - `new_ns`: `libxml2.xmlNs`
323
324 Both old_ns and new_ns may be None meaning no namespace set."""
325
326 if old_ns is not None:
327 old_ns_uri = old_ns.content
328 old_ns_prefix = old_ns.name
329 else:
330 old_ns_uri = None
331 old_ns_prefix = None
332
333 ns = _get_ns(node)
334 if ns is None and old_ns is None:
335 node.setNs(new_ns)
336 elif ns and ns.content == old_ns_uri and ns.name == old_ns_prefix:
337 node.setNs(new_ns)
338
339 p = node.properties
340 while p:
341 ns = _get_ns(p)
342 if ns is None and old_ns is None:
343 p.setNs(new_ns)
344 if ns and ns.content == old_ns_uri and ns.name == old_ns_prefix:
345 p.setNs(new_ns)
346 p = p.next
347
348 n = node.children
349 while n:
350 if n.type == 'element':
351 skip_element = False
352 try:
353 nsd = n.nsDefs()
354 except libxml2.treeError:
355 nsd = None
356 while nsd:
357 if nsd.name == old_ns_prefix:
358 skip_element = True
359 break
360 nsd = nsd.next
361 if not skip_element:
362 replace_ns(n, old_ns, new_ns)
363 n = n.next
364
365 pure_python = True
366
367
368
369
370
372 """Namespace of an XML node.
373
374 :Parameters:
375 - `xmlnode`: the XML node to query.
376 :Types:
377 - `xmlnode`: `libxml2.xmlNode`
378
379 :return: namespace of the node or `None`
380 :returntype: `libxml2.xmlNs`"""
381 try:
382 return xmlnode.ns()
383 except libxml2.treeError:
384 return None
385
387 """Return namespace URI of an XML node.
388
389 :Parameters:
390 - `xmlnode`: the XML node to query.
391 :Types:
392 - `xmlnode`: `libxml2.xmlNode`
393
394 :return: namespace URI of the node or `None`
395 :returntype: `unicode`"""
396 ns=get_node_ns(xmlnode)
397 if ns:
398 return unicode(ns.getContent(),"utf-8")
399 else:
400 return None
401
403 """Iterate over sibling XML nodes. All types of nodes will be returned
404 (not only the elements).
405
406 Usually used to iterate over node's children like this::
407
408 xml_node_iter(node.children)
409
410 :Parameters:
411 - `nodelist`: start node of the list.
412 :Types:
413 - `nodelist`: `libxml2.xmlNode`
414 """
415 node = nodelist
416 while node:
417 yield node
418 node = node.next
419
421 """Iterate over sibling XML elements. Non-element nodes will be skipped.
422
423 Usually used to iterate over node's children like this::
424
425 xml_node_iter(node.children)
426
427 :Parameters:
428 - `nodelist`: start node of the list.
429 :Types:
430 - `nodelist`: `libxml2.xmlNode`
431 """
432 node = nodelist
433 while node:
434 if node.type == "element":
435 yield node
436 node = node.next
437
439 """Iterate over sibling XML elements. Only elements in the given namespace will be returned.
440
441 Usually used to iterate over node's children like this::
442
443 xml_node_iter(node.children)
444
445 :Parameters:
446 - `nodelist`: start node of the list.
447 :Types:
448 - `nodelist`: `libxml2.xmlNode`
449 """
450 node = nodelist
451 while node:
452 if node.type == "element" and get_node_ns_uri(node)==ns_uri:
453 yield node
454 node = node.next
455
456 evil_characters_re=re.compile(r"[\000-\010\013\014\016-\037]",re.UNICODE)
457 utf8_replacement_char=u"\ufffd".encode("utf-8")
458
465
466 bad_nsdef_replace_re=re.compile(r"^([^<]*\<[^><]*\s+)(xmlns=((\"[^\"]*\")|(\'[^\']*\')))")
467
469 """Serialize an XML element making sure the result is sane.
470
471 Remove control characters and invalid namespace declarations from the
472 result string.
473
474 :Parameters:
475 - `xmlnode`: the XML element to serialize.
476 :Types:
477 - `xmlnode`: `libxml2.xmlNode`
478
479 :return: UTF-8 encoded serialized and sanitized element.
480 :returntype: `string`"""
481 try:
482 ns = xmlnode.ns()
483 except libxml2.treeError:
484 ns = None
485 try:
486 nsdef = xmlnode.nsDefs()
487 except libxml2.treeError:
488 nsdef = None
489 s=xmlnode.serialize(encoding="UTF-8")
490 while nsdef:
491 if nsdef.name is None and (not ns or (nsdef.name, nsdef.content)!=(ns.name, ns.content)):
492 s = bad_nsdef_replace_re.sub("\\1",s,1)
493 break
494 nsdef = nsdef.next
495 s=remove_evil_characters(s)
496 return s
497
499 """A simple push-parser interface for XML streams."""
501 """Initialize `StreamReader` object.
502
503 :Parameters:
504 - `handler`: handler object for the stream content
505 :Types:
506 - `handler`: `StreamHandler` derived class
507 """
508 self.reader=_create_reader(handler)
509 self.lock=threading.RLock()
510 self.in_use=0
512 """Get the document being parsed.
513
514 :return: the document.
515 :returntype: `libxml2.xmlDoc`"""
516 ret=self.reader.doc()
517 if ret:
518 return libxml2.xmlDoc(ret)
519 else:
520 return None
522 """Pass a string to the stream parser.
523
524 :Parameters:
525 - `s`: string to parse.
526 :Types:
527 - `s`: `str`
528
529 :return: `None` on EOF, `False` when whole input was parsed and `True`
530 if there is something still left in the buffer."""
531 self.lock.acquire()
532 if self.in_use:
533 self.lock.release()
534 raise StreamParseError,"StreamReader.feed() is not reentrant!"
535 self.in_use=1
536 try:
537 return self.reader.feed(s)
538 finally:
539 self.in_use=0
540 self.lock.release()
541
542
543
544