Jump to content

Python Programming/XML Tools

From Wikibooks, open books for an open world


Introduction

[edit | edit source]

Python's standard library includes several modules for parsing and manipulating XML.

xml.sax.handler

[edit | edit source]

Python Doc

import xml.sax.handler as saxhandler
import xml.sax as saxparser

class MyReport:
    """Placeholder report object that is handed to the content handler."""

    def __init__(self):
        # The only state is a single attribute initialised to 1.
        self.Y = 1


class MyCH(saxhandler.ContentHandler):
    """SAX content handler that prints document-start and element-start events.

    Args:
        report: Object stored on the handler as ``self.report``. The handler
            itself never reads it.
    """

    def __init__(self, report):
        # Run the base initialiser so the handler carries the state that
        # ContentHandler sets up for itself. It is called explicitly (not via
        # super()) so it also works where ContentHandler is an old-style class.
        saxhandler.ContentHandler.__init__(self)
        self.X = 1
        self.report = report

    def startDocument(self):
        """Print a marker when the parser reports the start of the document."""
        print('startDocument')

    def startElement(self, name, attrs):
        """Print the name of each opening element; ``attrs`` is ignored."""
        print('Element:', name)

# Sample document that is echoed and then fed through the handler.
xml = """\
<collection>
  <comic title=\"Sandman\" number='62'>
     <writer>Neil Gaiman</writer>
     <penciller pages='1-9,18-24'>Glyn Dillon</penciller>
     <penciller pages="10-17">Charles Vess</penciller>
  </comic>
</collection>
"""

# Show the raw document first; the handler's event output follows it.
print(xml)

# The report object is only stored on the handler (reserved for future use).
report = MyReport()
ch = MyCH(report)
saxparser.parseString(xml, ch)

xml.dom.minidom

[edit | edit source]

An example of parsing an RSS feed with the DOM API:

from contextlib import closing
from xml.dom import minidom as dom

try:
    import urllib2
except ImportError:  # Python 3: urlopen lives in urllib.request
    import urllib.request as urllib2

def fetchPage(url):
    """Download ``url`` and return the raw response body.

    The body is returned undecoded, exactly as read from the response, so
    that whatever parses it can apply the document's own encoding.

    Args:
        url: Address to open with ``urlopen``.

    Returns:
        The complete response body.
    """
    # closing() releases the connection even if read() raises.
    with closing(urllib2.urlopen(url)) as response:
        return response.read()

def _child_text(node, tag):
    """Return the leading text of the first ``tag`` element under ``node``.

    Returns an empty string when there is no such element, when it is empty,
    or when its first child is not a text/CDATA node.
    """
    matches = node.getElementsByTagName(tag)
    if not matches:
        return ''
    first = matches[0].firstChild
    # An empty element has no first child, and only text/CDATA nodes
    # provide wholeText.
    if first is None:
        return ''
    if first.nodeType not in (first.TEXT_NODE, first.CDATA_SECTION_NODE):
        return ''
    return first.wholeText


def extract(page):
    """Parse an RSS document and print title, link and description per item.

    Items without any child nodes are skipped. A missing or empty ``title``,
    ``link`` or ``description`` is printed as an empty string instead of
    aborting the whole run.

    Args:
        page: XML source of the feed, as accepted by ``minidom.parseString``.
    """
    document = dom.parseString(page)
    for item in document.getElementsByTagName('item'):
        if not item.hasChildNodes():
            continue
        title = _child_text(item, 'title')
        link = _child_text(item, 'link')
        description = _child_text(item, 'description')
        print(title, link, description)

if __name__ == '__main__':
    # Script entry point: download the Slashdot RSS feed and pass it to extract().
    feed_url = "http://rss.slashdot.org/Slashdot/slashdot"
    page = fetchPage(feed_url)
    extract(page)

The sample XML document (the comic collection) is taken from the PyXML documentation.