Using TagSoup (a Java tool) with Amara
For more on TagSoup, see "Tip: Rescue terrible HTML with TagSoup".
Here is code tested in Bright_Content (where it's used in brightcontent.util.tidy_content_element).
from subprocess import Popen, PIPE
from Ft.Lib import Uri

import amara

# TagSoup is fed the raw HTML on stdin and its output is read from stdout
cmdline = "java -jar tagsoup-1.2.jar --encoding='utf-8'"
URL = r"http://www.example.com"

process = Popen(cmdline, stdin=PIPE, stdout=PIPE, universal_newlines=True, shell=True)
# communicate() writes the input and reads the output until EOF in one call,
# so the parent never blocks on a full pipe. stderr is not piped, so perr is None.
unsouped, perr = process.communicate(input=Uri.UrlOpen(URL).read())
try:
    doc = amara.parse(unsouped)
except Exception:
    #Don't use this blanket exception in non-example code :-) Oh, and use L10N for error messages
    # Report the command line actually run (the variable is `cmdline`)
    raise ValueError('Empty or unexpected output from the command line. Command line: "%s"'%cmdline)
print(doc.html.head.title.xml())
Version of the recipe that uses temporary files. Consider this approach if the data to be tidied is greater than a few megabytes.
import os
import urllib2
import amara
import cStringIO
import tempfile
import popen2

# TagSoup command line; %s is replaced with the path of the HTML file to tidy
cmdline = "java -jar tagsoup-1.2.jar %s"
# URL to parse
url = r"http://www.hitimewine.net/istar.asp?a=6&id=161153!1247"

# Download the page into a temporary file. mkstemp() creates and opens the
# file in one step, avoiding the race condition that mktemp() is subject to.
fd, html_file = tempfile.mkstemp()
try:
    # Binary mode: store the downloaded bytes exactly as received
    f = os.fdopen(fd, 'wb')
    try:
        f.write(urllib2.urlopen(url).read())
    finally:
        f.close()

    # Run TagSoup on the temporary file
    pout, pin, perr = popen2.popen3(cmdline % html_file)

    # Parse TagSoup's output with Amara
    doc = amara.parse(pout.read())
finally:
    # Always remove the temporary file, even if the download or parse failed
    os.remove(html_file)
print(doc.xml(indent=True))
A dangerous version of the recipe that should not be used.
#DO NOT USE: Naive approach that easily succumbs to buffer deadlock
import urllib2
import popen2
import amara

TAGSOUP_CMDLINE = "java -jar tagsoup-1.2.jar --encoding='utf-8'"
URL = r"http://www.example.com"

# The whole page is written to TagSoup's stdin before any of its stdout is
# read; this ordering is what makes the recipe prone to the deadlock noted above.
child_stdout, child_stdin, child_stderr = popen2.popen3(TAGSOUP_CMDLINE)
page = urllib2.urlopen(URL).read()
child_stdin.write(page)
child_stdin.close()
try:
    doc = amara.parse(child_stdout)
except:
    #Don't use this blanket exception in non-example code :-) Oh, and use L10N for error messages
    raise ValueError('Empty or unexpected output from the command line. Command line: "%s"'%TAGSOUP_CMDLINE)
print(doc.html.head.title.xml())
Older version of the recipe
