Python Programming/XML Tools

From Wikibooks, open books for an open world
Jump to navigation Jump to search


Introduction

Python's standard library includes several modules for parsing and manipulating XML.


xml.sax.handler

See the Python documentation for the xml.sax.handler module.

import xml.sax.handler as saxhandler
import xml.sax as saxparser

class MyReport:
    """Placeholder report object that is handed to the SAX content handler."""

    def __init__(self):
        """Create the report with its single attribute ``Y`` set to 1."""
        self.Y = 1


class MyCH(saxhandler.ContentHandler):
    """SAX content handler that prints a trace of the parse events it sees.

    The handler prints one line when the document starts and one line per
    opening tag. It keeps a reference to a caller-supplied ``report``
    object but does not write to it.
    """

    def __init__(self, report):
        """Initialise the handler.

        report -- arbitrary object stored on ``self.report``.
        """
        # Call the base initialiser explicitly (rather than via super()) so
        # the same code runs on Python 2 and Python 3 and the base class
        # gets a chance to set up its own state.
        saxhandler.ContentHandler.__init__(self)
        self.X = 1
        self.report = report

    def startDocument(self):
        """Print a marker line when the parser begins a document."""
        # A single parenthesised argument prints identically on Python 2
        # and Python 3.
        print('startDocument')

    def startElement(self, name, attrs):
        """Print the name of every opening tag; ``attrs`` is ignored."""
        # Formatting into one string keeps the "Element: <name>" output of
        # the old two-argument print statement on both Python versions.
        print('Element: %s' % name)

# The report object is only stored by the handler; nothing reads it yet.
report = MyReport()
ch = MyCH(report)

# Sample document to parse.
# NOTE: the variable is named like the standard-library ``xml`` package.
# That is harmless here because the SAX modules were imported under the
# aliases ``saxhandler`` and ``saxparser``.
xml = """\
<collection>
  <comic title=\"Sandman\" number='62'>
     <writer>Neil Gaiman</writer>
     <penciller pages='1-9,18-24'>Glyn Dillon</penciller>
     <penciller pages="10-17">Charles Vess</penciller>
  </comic>
</collection>
"""

# Echo the input first; print(...) with one argument works on Python 2 and 3.
print(xml)

# Parse the string; the handler prints one line per event it receives.
saxparser.parseString(xml, ch)

xml.dom.minidom

An example of parsing an RSS feed with the DOM API:

from xml.dom import minidom as dom

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 has no urllib2; urlopen lives in urllib.request there.
    import urllib.request as urllib2

def fetchPage(url):
    """Download ``url`` and return the complete response body.

    The body is returned exactly as read from the response object, without
    decoding.
    """
    response = urllib2.urlopen(url)
    # try/finally instead of ``with`` so the response is always closed even
    # where the response object is not a context manager.
    try:
        return response.read()
    finally:
        response.close()

def _childText(node, tagName):
    """Return the leading text of the first ``tagName`` element under ``node``.

    Returns an empty string when no such element exists, when the element
    is empty, or when its first child is not a text or CDATA node.
    """
    matches = node.getElementsByTagName(tagName)
    if not matches:
        return ''
    first = matches[0].firstChild
    if first is None or first.nodeType not in (first.TEXT_NODE,
                                               first.CDATA_SECTION_NODE):
        return ''
    return first.wholeText


def extract(page):
    """Print the title, link and description of every ``item`` in a feed.

    page -- the XML document as a string, parsed with minidom.

    One line is printed per non-empty ``item`` element, with the three
    fields separated by single spaces. A field that is missing or empty is
    printed as an empty string instead of raising an exception.
    """
    document = dom.parseString(page)
    for item in document.getElementsByTagName('item'):
        if not item.hasChildNodes():
            continue
        fields = tuple(_childText(item, tag)
                       for tag in ('title', 'link', 'description'))
        # One pre-formatted argument gives the same "t l d" output on
        # Python 2 and Python 3.
        print('%s %s %s' % fields)

# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
    # Download the feed, then print the fields of each item in it.
    page = fetchPage("http://rss.slashdot.org/Slashdot/slashdot")
    extract(page)

The sample XML document used in the SAX example is taken from the PyXML documentation.