XML to JSON in Amara2.x
Contents
Basic idea
A while back Luis put up a wish-list page, "Serializing amara objects". With the new metadata extraction facility in Amara 2.x, this is a piece of cake to implement in a few lines, using declarative annotations in the XML.
# -*- coding: utf-8 -*-
# Example-based model for the book catalogue. It is an ordinary sample
# document whose elements carry ak:* annotations (ak:resource on <book>,
# ak:rel / ak:value on the leaf elements). It is turned into a model by
# examplotron_model() below and handed to bindery.parse() via model=.
# Note that <author> itself is not annotated here; only its <first> and
# <last> children are.
BOOK_MODEL = '''<?xml version="1.0" encoding="UTF-8"?>
<books xmlns:ak="http://purl.org/xml3k/akara/xmlmodel">
<book id="1" ak:resource="@id">
<title ak:rel="'title'" ak:value=".">Code Generation in Action</title>
<author>
<first ak:rel="'first'" ak:value=".">Jack</first>
<last ak:rel="'last'" ak:value=".">Herrington</last>
</author>
<publisher ak:rel="'publisher'" ak:value=".">Manning</publisher>
</book>
</books>
'''
# Sample input document: three <book> elements (ids 1-3), each with a title,
# a single author (first/last) and a publisher. It is parsed below with
# bindery.parse() using the model built from BOOK_MODEL.
TEST_BOOK_XML = '''<?xml version="1.0" encoding="utf-8"?>
<books>
<book id="1">
<title>Code Generation in Action</title>
<author><first>Jack</first><last>Herrington</last></author>
<publisher>Manning</publisher>
</book>
<book id="2">
<title>PHP Hacks</title>
<author><first>Jack</first><last>Herrington</last></author>
<publisher>O'Reilly</publisher>
</book>
<book id="3">
<title>Podcasting Hacks</title>
<author><first>Jack</first><last>Herrington</last></author>
<publisher>O'Reilly</publisher>
</book>
</books>
'''
from amara.bindery.model import *
from amara import bindery
from amara.lib import U
from itertools import *
from operator import *
# Build the model from the annotated example, parse the real document with
# that model attached, then ask the model for the metadata rows.
book_model = examplotron_model(BOOK_MODEL)
doc = bindery.parse(TEST_BOOK_XML, model=book_model)
metadata = doc.books.xml_model.generate_metadata(doc)
# Debugging aid (if metadata is a one-shot iterator, running this consumes it):
#print dict(( (bid, list(row)) for (bid, row) in groupby(metadata, itemgetter(0)) ))

# Each row is indexed as row[0] = resource ID, row[1] = relationship name,
# row[2] = value. groupby() only merges *adjacent* rows, so order the rows by
# resource ID first; otherwise a resource whose rows are not contiguous would
# have its earlier entries overwritten. The sort is keyed on the ID alone and
# is stable, so rows keep their original relative order within each resource.
rows_by_id = sorted(metadata, key=itemgetter(0))

books = {}
for bid, rows in groupby(rows_by_id, itemgetter(0)):
    # One flat dict per book: relationship name -> value converted with U().
    books[bid] = dict((row[1], U(row[2])) for row in rows)

import pprint
pprint.pprint(books)
Output:
{u'1': {u'first': u'Jack',
u'last': u'Herrington',
u'publisher': u'Manning',
u'title': u'Code Generation in Action'},
u'2': {u'first': u'Jack',
u'last': u'Herrington',
u'publisher': u"O'Reilly",
u'title': u'PHP Hacks'},
u'3': {u'first': u'Jack',
u'last': u'Herrington',
u'publisher': u"O'Reilly",
u'title': u'Podcasting Hacks'}}
Note: the above is not JSON, though it's close. The best way to get actual JSON is simplejson. Just add the following to the end of the top listing:
# sys is not imported anywhere in the top listing, but sys.stdout is needed
# as the output stream for dump().
import sys

import simplejson

# Serialize the books dict built by the top listing as indented JSON.
simplejson.dump(books, sys.stdout, indent=4)
The output is:
{
"1": {
"publisher": "Manning",
"first": "Jack",
"last": "Herrington",
"title": "Code Generation in Action"
},
"3": {
"publisher": "O'Reilly",
"first": "Jack",
"last": "Herrington",
"title": "Podcasting Hacks"
},
"2": {
"publisher": "O'Reilly",
"first": "Jack",
"last": "Herrington",
"title": "PHP Hacks"
}
}
Expanding on the theme
Let's make it a bit more sophisticated. This version treats authors as entities in themselves, and as such nicely handles the case of multiple authors.
# -*- coding: utf-8 -*-
# Model variant in which <author> is annotated as well: it carries an ak:rel
# and an (empty) ak:resource, so authors show up as entries of their own in
# the output. Per the XML comment embedded in the model, the empty expression
# leads to generated resource IDs, and concat(first, last) is offered as an
# alternative. ak:rel uses name() on title/author/publisher instead of a
# quoted literal.
BOOK_MODEL = '''<?xml version="1.0" encoding="UTF-8"?>
<books xmlns:ak="http://purl.org/xml3k/akara/xmlmodel">
<book id="1" ak:resource="@id">
<title ak:rel="name()" ak:value=".">Code Generation in Action</title>
<author ak:rel="name()" ak:resource="">
<!--
If you don't like the generated resource IDs, try:
<author ak:rel="name()" ak:resource="concat(first, last)">
-->
<first ak:rel="'given name'" ak:value=".">Jack</first>
<last ak:rel="'surname'" ak:value=".">Herrington</last>
</author>
<publisher ak:rel="name()" ak:value=".">Manning</publisher>
</book>
</books>
'''
# Sample input document: the same three books as before, except that book 3
# has two <author> elements, which exercises the multiple-author case.
TEST_BOOK_XML = '''<?xml version="1.0" encoding="utf-8"?>
<books>
<book id="1">
<title>Code Generation in Action</title>
<author><first>Jack</first><last>Herrington</last></author>
<publisher>Manning</publisher>
</book>
<book id="2">
<title>PHP Hacks</title>
<author><first>Jack</first><last>Herrington</last></author>
<publisher>O'Reilly</publisher>
</book>
<book id="3">
<title>Podcasting Hacks</title>
<author><first>Jack</first><last>Herrington</last></author>
<author><first>No</first><last>One</last></author>
<publisher>O'Reilly</publisher>
</book>
</books>
'''
from amara.bindery.model import *
from amara import bindery
from amara.lib import U
from itertools import *
from operator import *
from collections import defaultdict
# Build the model from the annotated example, parse the real document with
# that model attached, then ask the model for the metadata rows.
book_model = examplotron_model(BOOK_MODEL)
doc = bindery.parse(TEST_BOOK_XML, model=book_model)
metadata = doc.books.xml_model.generate_metadata(doc)

books = {}
# groupby() only merges adjacent rows, so sort to bring all rows of a
# resource together. Sorting on the resource ID alone (rather than on whole
# rows) means the values themselves are never compared, and because the sort
# is stable, multiple values for one key stay in the order they were produced.
for bid, rows in groupby(sorted(metadata, key=itemgetter(0)), itemgetter(0)):
    properties = books[bid] = {}
    # Every key maps to a list so that repeated relationships (e.g. several
    # authors on one book) accumulate instead of overwriting each other.
    # A plain loop consumes the lazy group directly; no throwaway list needed.
    for _rid, rel, val in rows:
        properties.setdefault(rel, []).append(U(val))

import pprint
pprint.pprint(books)
Output:
{u'1': {u'author': [u'r2e0e1e3'],
u'publisher': [u'Manning'],
u'title': [u'Code Generation in Action']},
u'2': {u'author': [u'r2e0e3e3'],
u'publisher': [u"O'Reilly"],
u'title': [u'PHP Hacks']},
u'3': {u'author': [u'r2e0e5e3', u'r2e0e5e5'],
u'publisher': [u"O'Reilly"],
u'title': [u'Podcasting Hacks']},
u'r2e0e1e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
u'r2e0e3e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
u'r2e0e5e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
u'r2e0e5e5': {u'given name': [u'No'], u'surname': [u'One']}}
