If you have come to this article, you probably need to extract some data from XML files and print the result to the standard output or send it through a pipe.
My little script, xpq.py, can extract some data from XML files and display the result in the standard output. It uses libxml2 to achieve that.
Here is how to use it:
python xpq.py xmlfile xpathquery [option]
or
command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node
You will find some examples below.
Consider the following simple XML file (test.xml):
<?xml version="1.0" encoding="utf-8"?> <root> <element use="2"> <subelement id="1"> Sub-element text </subelement> <subelement id="2"> Sub-element 2 text </subelement> </element> </root>
Now have a look at the following commands and their output to see what the script does:
$ python xpq.py test.xml /root/element/@use content 2 $ cat test.xml | python xpq.py /root/element/@use content 2 $ python xpq.py test.xml /root/element/subelement[@id=2] content Sub-element 2 text $ python xpq.py test.xml /root/element/@use name use $ python xpq.py test.xml /root/element/@use type attribute $ python xpq.py test.xml /root/element/subelement[@id=1] node <subelement id="1"> Sub-element text </subelement> $ python xpq.py test.xml /root/element/subelement[@id=1] parent element
Here is the script (also available for download here; I recommend downloading it rather than copy-pasting, as indentation is significant in Python):
import sys import libxml2 import os.path import select # Check if some data is in stdin, if yes, read its content to xmlbuffer if select.select([sys.stdin,],[],[],0.0)[0]: frompipe = True xmlbuffer = sys.stdin.read() else: frompipe = False # Codes to print bold text and reset to normal text bold = "\033[1m" reset = "\033[0;0m" # Return the first node of the result of a XPath query, None else. def xpq(docxml, query): nodes = docxml.xpathEval(query) if len(nodes) > 0: return nodes[0] return None; # Print script usage def printusage(errorcode): print "Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]" print "or: command |", bold, "xpq xpathquery", reset, "[option] (Unix systems only)" print "option:" print " ", bold, "content", reset," : get the content of the current node (default)" print " ", bold, "name", reset," : get the name of the current node" print " ", bold, "type", reset," : get the type of the current node" print " ", bold, "parent", reset," : get the parent node name of the current node" print " ", bold, "node", reset," : get the representation of the current node" sys.exit(errorcode); # Check for arguments argc = len(sys.argv) getwhat = "content" exist = False if frompipe == True: if argc >= 2: query = sys.argv[1] if argc == 3: if sys.argv[2] == "content": getwhat = "content" elif sys.argv[2] == "name": getwhat = "name" elif sys.argv[2] == "type": getwhat = "type" elif sys.argv[2] == "parent": getwhat = "parent" elif sys.argv[2] == "node": getwhat = "node" else: printusage(3) elif argc > 3: printusage(1) else: printusage(1) else: if argc >= 3: if os.path.exists( sys.argv[1] ) == True: exist = True xmlfile = sys.argv[1] else: print "Input file not found!" 
printusage(2) query = sys.argv[2] if argc == 4: if sys.argv[3] == "content": getwhat = "content" elif sys.argv[3] == "name": getwhat = "name" elif sys.argv[3] == "type": getwhat = "type" elif sys.argv[3] == "parent": getwhat = "parent" elif sys.argv[3] == "node": getwhat = "node" else: printusage(3) elif argc > 4: printusage(1) else: printusage(1) # Print the result of the query if exist == True or frompipe == True: try: if exist == True: doc = libxml2.parseFile(xmlfile) if frompipe == True: doc = libxml2.parseDoc(xmlbuffer) except: print "Unable to load source file! Is it a valid XML file?" printusage(4) try: node = xpq(doc, query) if node == None: print "[Error] No node was found" except: print "Invalid XPath query!" doc.freeDoc() sys.exit(6) if node != None: if getwhat == "content": print node.content elif getwhat == "name": print node.name elif getwhat == "type": print node.type elif getwhat == "parent": print node.parent.name elif getwhat == "node": print node doc.freeDoc() else: printusage(7)