XML to JSON in Amara 2.x

Basic idea

A while back Luis put up this wish-list item page, "Serializing amara objects". With the new metadata extraction facility in Amara 2.x, it's a piece of cake to implement in a few lines, using declarative features in the XML.

# -*- coding: utf-8 -*-

# Annotated example document that is handed to examplotron_model() below.
# The ak:* attributes (akara xmlmodel namespace) mark what becomes metadata.
# Judging by the printed output: ak:resource="@id" keys each book by its id
# attribute, and every ak:rel / ak:value pair supplies a property name and
# the element's content as its value.
BOOK_MODEL = '''<?xml version="1.0" encoding="UTF-8"?>
<books xmlns:ak="http://purl.org/xml3k/akara/xmlmodel">
    <book id="1" ak:resource="@id">
        <title ak:rel="'title'" ak:value=".">Code Generation in Action</title>
        <author>
          <first ak:rel="'first'" ak:value=".">Jack</first>
          <last ak:rel="'last'" ak:value=".">Herrington</last>
        </author>
        <publisher ak:rel="'publisher'" ak:value=".">Manning</publisher>
    </book>
</books>
'''

# Input document to convert: three books, each with a single author.
# It carries no ak:* annotations; the model is supplied separately when the
# document is parsed (bindery.parse(..., model=book_model)).
TEST_BOOK_XML = '''<?xml version="1.0" encoding="utf-8"?>
<books>
    <book id="1">
        <title>Code Generation in Action</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <publisher>Manning</publisher>
    </book>
    <book id="2">
        <title>PHP Hacks</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <publisher>O'Reilly</publisher>
    </book>
    <book id="3">
        <title>Podcasting Hacks</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <publisher>O'Reilly</publisher>
    </book>
</books>
'''

from amara.bindery.model import *
from amara import bindery
from amara.lib import U
from itertools import *
from operator import *
# Build the model from the annotated example, parse the real document against
# it, then pull out the metadata rows. Each row is indexed below as
# (resource ID, property name, value).
book_model = examplotron_model(BOOK_MODEL)
doc = bindery.parse(TEST_BOOK_XML, model=book_model)
metadata = doc.books.xml_model.generate_metadata(doc)
#print dict(( (bid, list(row)) for (bid, row) in groupby(metadata, itemgetter(0)) ))
books = {}
# groupby only merges *adjacent* rows with the same key, so order the rows by
# resource ID first; otherwise a book whose rows are not contiguous would be
# split into several groups and the later group would overwrite the earlier.
# Sorting on the ID alone is stable, so rows keep their order within a book.
for bid, rows in groupby(sorted(metadata, key=itemgetter(0)), itemgetter(0)):
    # U() coerces each value to a unicode string.
    books[bid] = dict((r[1], U(r[2])) for r in rows)
import pprint
pprint.pprint(books)

Output:

{u'1': {u'first': u'Jack',
        u'last': u'Herrington',
        u'publisher': u'Manning',
        u'title': u'Code Generation in Action'},
 u'2': {u'first': u'Jack',
        u'last': u'Herrington',
        u'publisher': u"O'Reilly",
        u'title': u'PHP Hacks'},
 u'3': {u'first': u'Jack',
        u'last': u'Herrington',
        u'publisher': u"O'Reilly",
        u'title': u'Podcasting Hacks'}}

Note: the above is not JSON, though it's close. The best way to get actual JSON is simplejson. Just add the following to the end of the top listing:

# sys is needed for sys.stdout; the listing this is appended to does not
# import it, so import it here to avoid a NameError.
import sys
import simplejson
simplejson.dump(books, sys.stdout, indent=4)

The output is:

{
    "1": {
        "publisher": "Manning", 
        "first": "Jack", 
        "last": "Herrington", 
        "title": "Code Generation in Action"
    }, 
    "3": {
        "publisher": "O'Reilly", 
        "first": "Jack", 
        "last": "Herrington", 
        "title": "Podcasting Hacks"
    }, 
    "2": {
        "publisher": "O'Reilly", 
        "first": "Jack", 
        "last": "Herrington", 
        "title": "PHP Hacks"
    }
}

Expanding on the theme

Let's make it a bit more sophisticated. This version treats authors as entities in themselves, and as such nicely handles the case of multiple authors.

# -*- coding: utf-8 -*-


# Annotated example document that is handed to examplotron_model() below.
# Compared with the basic version, ak:rel="name()" takes the property name
# from the element name, and <author> is itself marked as a resource
# (ak:resource=""), which yields generated resource IDs for authors (see the
# XML comment inside the model for an alternative ID expression).
BOOK_MODEL = '''<?xml version="1.0" encoding="UTF-8"?>
<books xmlns:ak="http://purl.org/xml3k/akara/xmlmodel">
    <book id="1" ak:resource="@id">
        <title ak:rel="name()" ak:value=".">Code Generation in Action</title>
        <author ak:rel="name()" ak:resource="">
        <!--
        If you don't like the generated resource IDs, try:
        <author ak:rel="name()" ak:resource="concat(first, last)">
        -->
          <first ak:rel="'given name'" ak:value=".">Jack</first>
          <last ak:rel="'surname'" ak:value=".">Herrington</last>
        </author>
        <publisher ak:rel="name()" ak:value=".">Manning</publisher>
    </book>
</books>
'''

# Input document to convert: three books, the third of which has two
# <author> elements to exercise the multiple-author case.
TEST_BOOK_XML = '''<?xml version="1.0" encoding="utf-8"?>
<books>
    <book id="1">
        <title>Code Generation in Action</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <publisher>Manning</publisher>
    </book>
    <book id="2">
        <title>PHP Hacks</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <publisher>O'Reilly</publisher>
    </book>
    <book id="3">
        <title>Podcasting Hacks</title>
        <author><first>Jack</first><last>Herrington</last></author>
        <author><first>No</first><last>One</last></author>
        <publisher>O'Reilly</publisher>
    </book>
</books>
'''

from amara.bindery.model import *
from amara import bindery
from amara.lib import U
from itertools import *
from operator import *
from collections import defaultdict

# Build the model from the annotated example, parse the real document against
# it, then pull out the metadata rows, which are unpacked below as
# (resource ID, property name, value) triples.
book_model = examplotron_model(BOOK_MODEL)
doc = bindery.parse(TEST_BOOK_XML, model=book_model)
metadata = doc.books.xml_model.generate_metadata(doc)
books = {}
# groupby only merges *adjacent* rows, so order the rows by resource ID first.
# Sorting on the ID alone (sorted() is stable) leaves each resource's rows in
# the order generate_metadata produced them, so multi-valued properties such
# as 'author' are not reshuffled by comparing their values.
for bid, rows in groupby(sorted(metadata, key=itemgetter(0)), itemgetter(0)):
    props = books[bid] = {}
    for (_, key, val) in rows:
        # A property may repeat (e.g. several authors), so every property
        # maps to a list of unicode values.
        props.setdefault(key, []).append(U(val))
import pprint
pprint.pprint(books)

Output:

{u'1': {u'author': [u'r2e0e1e3'],
        u'publisher': [u'Manning'],
        u'title': [u'Code Generation in Action']},
 u'2': {u'author': [u'r2e0e3e3'],
        u'publisher': [u"O'Reilly"],
        u'title': [u'PHP Hacks']},
 u'3': {u'author': [u'r2e0e5e3', u'r2e0e5e5'],
        u'publisher': [u"O'Reilly"],
        u'title': [u'Podcasting Hacks']},
 u'r2e0e1e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
 u'r2e0e3e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
 u'r2e0e5e3': {u'given name': [u'Jack'], u'surname': [u'Herrington']},
 u'r2e0e5e5': {u'given name': [u'No'], u'surname': [u'One']}}

See also

Amara/Recipes/XML_to_JSON (last edited 2011-04-21 07:15:56 by LuisMiguel)