|
View:
New views
1 Messages
—
Rating Filter:
Alert me
|
|
|
sgml2asciidoc.py

Hi,
We (BBN's Network Research group) wrote the following Python script earlier this year to convert some of our existing DocBook SGML documentation to AsciiDoc. While we don't expect it to work perfectly with everyone else's SGML without some modifications (features were added only on an as-needed basis), we're distributing it in the hope that others may find it useful. License is GPLv2.

Regards,
Nick Goffee
BBN Technologies
Cambridge, MA

#!/usr/bin/env python2.4
"""
sgml2asciidoc.py -- convert a subset of DocBook SGML to AsciiDoc

$Revision: 1.15 $
$Date: 2008/03/14 18:15:36 $

DESCRIPTION

    Quick'n'dirty DocBook SGML-to-AsciiDoc converter using Python's
    built-in sgmllib.  Absolutely no pretensions of compliance to the
    full DocBook spec; support for each tag and quirk was added
    on-the-fly as it was encountered in converting our own *.sgml docs.

USAGE

    ./sgml2asciidoc.py < input.sgml > output.asciidoc

OPTIONS

    -d  Show some additional debugging output.
    -h  Show this help string.

COPYING

    Copyright (c) 2008 BBN Technologies Corp.

    This program is free software; you can redistribute it and/or
    modify it under the terms of the GNU General Public License as
    published by the Free Software Foundation; either version 2 of the
    License, or (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
    General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
""" import sys import traceback import re from sgmllib import SGMLParser from textwrap import TextWrapper class Tag: def __init__(self, name, attrs): self.name = name self.attrs = dict(attrs) self.data = "" self.paracount = 0 recognized_tags = set(( 'article', 'articleinfo', 'title', 'author', 'firstname', 'surname', 'revhistory', 'revision', 'revnumber', 'date', 'para', 'itemizedlist', 'orderedlist', 'listitem', 'ulink', 'screen', 'emphasis', 'command', 'note', 'filename', 'hostname', 'literal', 'bridgehead', 'formalpara', 'simplelist', 'member', )) sect_tags = set(( 'sect1', 'sect2', 'sect3', 'sect4', 'sect5', )) recognized_tags.update(sect_tags) # AsciiDoc only supports section levels 1-4, so replace DocBook # <sect5> w/ <sect4>; nobody's really going to notice the difference # that deep in the hierarchy. section_chars = ('=', '-', '~', '^', '+', '+') bullet_chars = ('-', '*') def print_tag_real(lineno, indent, s): print >> sys.stderr, "%04d %s%s" % (lineno, ' ' * indent * 4, s) def print_tag_null(lineno, indent, s): pass print_tag = print_tag_null class MyParser(SGMLParser): def __init__(self): SGMLParser.__init__(self) self.lineno = 0 self.stack = [] self.listlevel = 0 def dump_para(self, para): if self.listlevel > 0: if para.name == 'member': # <member> is a list item that doesn't need <para> # tags around it, so treat it like its own parent # (which will never exceed paracount == 1) assert self.stack[-1].name == 'simplelist' parent = para grandparent = None else: # for everything else, walk up the stack looking for # innermost enclosing list, and make that the # grandparent parent = self.stack[-1] assert parent.name == 'listitem' for grandparent in reversed(self.stack): if grandparent.name in ('itemizedlist', 'orderedlist'): break assert grandparent.name in ('itemizedlist', 'orderedlist') if parent.paracount == 0: # first paragraph in a list item, so prefix the first # line of the list item itself with a with a bullet, # then indent subsequent lines 
initial_indent = " " * (self.listlevel * 2) subsequent_indent = " " * (self.listlevel * 2) if grandparent and grandparent.name == 'orderedlist': bullet_char = '.' else: bullet_char = bullet_chars[(self.listlevel - 1) % len(bullet_chars)] initial_indent = ' ' * (self.listlevel * 2 - 2) \ + bullet_char + ' ' subsequent_indent = ' ' * (self.listlevel * 2) else: # in the rare case of more than one paragraph in a # single <listitem>, subsequent paragraphs are not # indented initial_indent = "" subsequent_indent = "" else: parent = self.stack[-1] assert parent.name in sect_tags or parent.name == 'note' initial_indent = "" subsequent_indent = "" data = para.data.strip() if data: if self.listlevel > 0: if parent.paracount > 0: print '+' else: elif parent.paracount > 0: wrapper = TextWrapper(width=70, initial_indent=initial_indent, subsequent_indent=subsequent_indent, break_long_words=False) print wrapper.fill(data) parent.paracount += 1 para.data = "" # reset it def unknown_starttag(self, name, attrs): # Handle some quirky special cases while self.stack: # Before beginning a list or <screen> inside a <para>, we # need to dump the para's text-so-far to stdout # immediately. Otherwise, it'll get dumped *after* the # list or <screen>, which is wrong. if name in ('itemizedlist', 'orderedlist', 'simplelist', 'screen') \ and self.stack[-1].name == 'para': # go ahead and dump paragraph-in-progress print_tag(self.lineno, len(self.stack), "-- dump_para() --") # pop it off stack because dump_para() expects this para = self.stack.pop() self.dump_para(para) # push it pack self.stack.append(para) break # SGML sloppiness fix: maybe this start tag is opening a # new <listitem> and implicitly closes the previous one elif self.stack[-1].name == name: self.unknown_endtag(self.stack[-1].name, "@@@") # Another SGML sloppiness fix: don't let sections be # nested inside anything other than sections! 
elif name in sect_tags and self.stack[-1].name not in sect_tags \ and self.stack[-1].name != 'article': self.unknown_endtag(self.stack[-1].name, "###") else: break # debug output print_tag(self.lineno, len(self.stack), name) # main processing switch for start tags if name in ('itemizedlist', 'orderedlist', 'simplelist'): self.listlevel += 1 elif name == 'note': print '=' * 70 tag = Tag(name, attrs) self.stack.append(tag) def handle_data(self, data): # just append between-tag text to the topmost tag on the stack if self.stack: self.stack[-1].data += data def unknown_endtag(self, name, mark=""): # Another SGML sloppiness fix: we may encounter, e.g., a # </itemizedlist> that implicitly closes the still-open # <listitem> while self.stack[-1].name != name: self.unknown_endtag(self.stack[-1].name, "!!!") tag = self.stack.pop() print_tag(self.lineno, len(self.stack), "/%s %s" % (name, mark)) # if it's a <literal> tag, then treat it as either <screen> # or <command> depending on whether it contains newlines if name == 'literal': if '\n' in tag.data: name = 'screen' else: name = 'command' if name == 'title': parent = self.stack[-1] assert parent.name in ('sect1', 'sect2', 'sect3', 'sect4', 'sect5', 'articleinfo', 'formalpara') print # blank line before each title data = tag.data.strip().replace('\n', ' ') if parent.name == 'formalpara': print ".%s" % data else: if parent.name == 'articleinfo': seclevel = 0 else: seclevel = int(parent.name[4]) if seclevel > 4: print '/' * 70 print "Was <sect%d> in SGML," % seclevel, print "but AsciiDoc only has section levels 1-4." 
print '/' * 70 if parent.attrs.has_key('id'): print "[[%s]]" % parent.attrs['id'] print data print section_chars[seclevel] * len(data) elif name == 'firstname': assert self.stack[-1].name == 'author' assert self.stack[-2].name == 'articleinfo' print tag.data.strip(), # space, no newline elif name == 'surname': assert self.stack[-1].name == 'author' assert self.stack[-2].name == 'articleinfo' print tag.data.strip().replace(' ', '_') elif name == 'date': assert self.stack[-1].name == 'revision' assert self.stack[-2].name == 'revhistory' print tag.data.strip() elif name in ('itemizedlist', 'orderedlist', 'simplelist'): self.listlevel -= 1 elif name in ('para', 'member'): self.dump_para(tag) elif name == 'listitem': parent = self.stack[-1] assert parent.name in ('itemizedlist', 'orderedlist', 'para') elif name == 'ulink': parent = self.stack[-1] assert parent.name == 'para' if ':' in tag.attrs['url']: # absolute URL parent.data += "%s[%s]" % (tag.attrs['url'], tag.data.strip()) else: # relative URL parent.data += "link:%s[%s]" % (tag.attrs['url'], tag.data.strip()) elif name == 'screen': if self.listlevel > 0: parent = self.stack[-1] if parent.name == 'para': parent = self.stack[-2] assert parent.name == 'listitem' if parent.paracount > 0: print '+' else: print '-' * 70 # seem to have an extraneous newline at each end, so this # is an ad-hoc solution to get rid of them print tag.data.strip('\n') print '-' * 70 elif name == 'emphasis': parent = self.stack[-1] assert parent.name == 'para' parent.data += "__%s__" % tag.data.strip() elif name in ('command', 'filename', 'hostname'): parent = self.stack[-1] assert parent.name in ('para', 'screen') if parent.name == 'screen': parent.data += tag.data.strip() else: parent.data += "++%s++" % tag.data.strip() elif name == 'note': print '=' * 70 elif name == 'bridgehead': print >> sys.stderr, \ "WARNING: <bridgehead> being interpreted as <sect4>; make " + \ " sure this is semantically faithful" # make it look like a sect4 data = 
tag.data.strip() print tag.data print '+' * len(tag.data) elif name in recognized_tags: if tag.data.strip(): print >> sys.stderr, \ "WARNING: unexpectedly non-empty tag '%s'" % name else: print >> sys.stderr, \ "WARNING: unrecognized tag '%s'" % name def unknown_charref(self, ref): print >> sys.stderr, \ "WARNING: unknown character reference '%s'" % ref def unknown_entityref(self, ref): print >> sys.stderr, \ "WARNING: unknown entity reference '%s'" % ref def report_unbalanced(self, name): print >> sys.stderr, \ "ERROR: unbalanced tag '%s'" % name sys.exit(1) def parse(self, stream): for line in stream: # somebody seems to like putting in paragraphs with a # single line of the form: # [to be filled in by so-and-so] # These confuse AsciiDoc since it's the same syntax used # for paragraph styles, so nop-quote them line = re.sub(r'^[[]([^]]*)[]]$', r'#[\1]#', line) self.lineno += 1 try: self.feed(line) except Exception: exctype, excvalue, tb = sys.exc_info() traceback.print_exception(exctype, excvalue, tb) print >> sys.stderr, "exception at input line %d" % self.lineno sys.exit(1) self.close() if __name__ == '__main__': if len(sys.argv) > 1: if sys.argv[1] == "-d": print_tag = print_tag_real elif sys.argv[1] in ("-h", "-help", "--help"): print __doc__ sys.exit(0) myparser = MyParser() myparser.parse(sys.stdin) _______________________________________________ asciidoc-discuss mailing list asciidoc-discuss@... http://lists.metaperl.com/cgi-bin/mailman/listinfo/asciidoc-discuss |
| Free embeddable forum powered by Nabble | Forum Help |