If you come to this article, you probably need to extract some data from some XML files, and print the result to the standard output or redirect it to a pipe.
My little script, xpq.py, can extract data from XML files with an XPath query and display the result on the standard output. It uses libxml2 to achieve that.
Here is how to use it:
python xpq.py xmlfile xpathquery [option]
or
command | python xpq.py xpathquery [option]
option:
content : get the content of the current node (default)
name : get the name of the current node
type : get the type of the current node
parent : get the parent node name of the current node
node : get the representation of the current node
You will find some examples below.
Consider the following simple XML file (test.xml):
<?xml version="1.0" encoding="utf-8"?>
<root>
<element use="2">
<subelement id="1">
Sub-element text
</subelement>
<subelement id="2">
Sub-element 2 text
</subelement>
</element>
</root>
Now have a look at the following commands and their output to see what the script does:
$ python xpq.py test.xml /root/element/@use content
2
$ cat test.xml | python xpq.py /root/element/@use content
2
$ python xpq.py test.xml /root/element/subelement[@id=2] content
Sub-element 2 text
$ python xpq.py test.xml /root/element/@use name
use
$ python xpq.py test.xml /root/element/@use type
attribute
$ python xpq.py test.xml /root/element/subelement[@id=1] node
<subelement id="1">
Sub-element text
</subelement>
$ python xpq.py test.xml /root/element/subelement[@id=1] parent
element
Here is the script (also available for download here, I would recommend it over copy-pasting as indentation is important in Python):
import sys
import libxml2
import os.path
import select
# Poll stdin without blocking (timeout 0.0). When it is reported readable,
# the input is treated as piped and read in full into xmlbuffer; otherwise
# xmlbuffer is left undefined and the XML must come from a file argument.
# NOTE(review): the poll does not wait, so a producer that has not written
# anything yet would be seen as "no pipe" -- confirm this is acceptable.
frompipe = bool(select.select([sys.stdin], [], [], 0.0)[0])
if frompipe:
    xmlbuffer = sys.stdin.read()
# Terminal escape codes used by the usage text: `bold` switches to bold
# text, `reset` switches back to normal text.
bold, reset = "\033[1m", "\033[0;0m"
# Return the first node of the result of a XPath query, None else.
def xpq(docxml, query):
    """Evaluate `query` on `docxml` and return the first result, or None.

    docxml -- object providing an xpathEval(query) method (the script
              passes a parsed libxml2 document)
    query  -- XPath expression, handed to xpathEval() unchanged

    Any exception raised by xpathEval() propagates to the caller.
    """
    matches = docxml.xpathEval(query)
    try:
        return matches[0]
    except IndexError:
        # Empty result: the query matched nothing.
        return None
# Print script usage, then terminate the script.
def printusage(errorcode):
    """Print the command-line help on standard output and exit.

    errorcode -- exit status passed to sys.exit()

    Never returns normally: sys.exit() raises SystemExit.
    Reads the module-level `bold` and `reset` escape codes.
    """
    def say(*words):
        # One line of output: the words separated by single spaces, which is
        # the layout a Python 2 `print a, b, c` statement produces.  Calling
        # print with one parenthesised string works on Python 2 and 3 alike.
        print(" ".join(words))

    say("Usage:", bold, "xpq xmlfile xpathquery", reset, "[option]")
    say("or: command |", bold, "xpq xpathquery", reset,
        "[option] (Unix systems only)")
    say("option:")
    options = (
        ("content", " : get the content of the current node (default)"),
        ("name", " : get the name of the current node"),
        ("type", " : get the type of the current node"),
        ("parent", " : get the parent node name of the current node"),
        ("node", " : get the representation of the current node"),
    )
    for option, description in options:
        say(" ", bold, option, reset, description)
    sys.exit(errorcode)
# Check for arguments
#
# Two call forms are accepted:
#   xpq.py xmlfile xpathquery [option]    (XML read from a file)
#   command | xpq.py xpathquery [option]  (XML already read from stdin)
# They differ only by the leading file name, so both share one code path
# driven by the position of the query in sys.argv.
#
# Results: `query`, `getwhat` (defaults to "content"), and for the file form
# `exist` / `xmlfile`.  Every error goes through printusage(), which exits
# the script: 1 = wrong number of arguments, 2 = input file not found,
# 3 = unknown option.
argc = len(sys.argv)
getwhat = "content"
exist = False

# Index of the XPath query: argv[1] when piped, argv[2] after a file name.
query_index = 1 if frompipe else 2

if argc <= query_index:
    printusage(1)

if not frompipe:
    if os.path.exists(sys.argv[1]):
        exist = True
        xmlfile = sys.argv[1]
    else:
        print("Input file not found!")
        printusage(2)

query = sys.argv[query_index]

if argc == query_index + 2:
    # Exactly one extra argument: it must be one of the known options.
    if sys.argv[query_index + 1] in ("content", "name", "type", "parent", "node"):
        getwhat = sys.argv[query_index + 1]
    else:
        printusage(3)
elif argc > query_index + 2:
    printusage(1)
# Print the result of the query
#
# Loads the document (from the file when `exist` is set, otherwise from the
# piped `xmlbuffer`), evaluates `query` through xpq() and prints the piece
# of the first matching node selected by `getwhat`.
# Exit codes: 4 = document could not be parsed, 6 = XPath evaluation failed,
# 7 = no input source at all.  A query that matches nothing prints a message
# and still ends normally.
if exist or frompipe:
    try:
        # `exist` is only ever set when the input is not piped, so exactly
        # one of the two sources applies here.
        if exist:
            doc = libxml2.parseFile(xmlfile)
        else:
            doc = libxml2.parseDoc(xmlbuffer)
    except Exception:
        # `except Exception` rather than a bare `except`, so Ctrl-C and
        # SystemExit are not reported as a parse failure.
        print("Unable to load source file! Is it a valid XML file?")
        printusage(4)

    try:
        try:
            node = xpq(doc, query)
        except Exception:
            print("Invalid XPath query!")
            sys.exit(6)

        if node is None:
            print("[Error] No node was found")
        elif getwhat == "content":
            print(node.content)
        elif getwhat == "name":
            print(node.name)
        elif getwhat == "type":
            print(node.type)
        elif getwhat == "parent":
            print(node.parent.name)
        elif getwhat == "node":
            print(node)
    finally:
        # Release the document on every path, including sys.exit(6) above
        # and any error raised while printing.
        doc.freeDoc()
else:
    printusage(7)