sgml2asciidoc.py

View: New views
1 Messages — Rating Filter:   Alert me  

sgml2asciidoc.py

by Nick Goffee :: Rate this Message:

Reply to Author | View Threaded | Show Only this Message

Hi,

We (BBN's Network Research group) wrote the following Python script
earlier this year to convert some of our existing DocBook SGML
documentation to AsciiDoc.  While we don't expect it to work perfectly
with everyone else's SGML without some modifications (features were
added only on an as-needed basis), we're distributing it in the hope
that others may find it useful.  License is GPLv2.

Regards,

Nick Goffee
BBN Technologies
Cambridge, MA

#!/usr/bin/env python2.4

"""
sgml2asciidoc.py -- convert a subset of DocBook SGML to AsciiDoc

$Revision: 1.15 $
$Date: 2008/03/14 18:15:36 $

DESCRIPTION

  Quick'n'dirty DocBook SGML-to-AsciiDoc converter using Python's
  built-in sgmllib.  Absolutely no pretensions of compliance to the
  full DocBook spec; support for each tag and quirk was added
  on-the-fly as it was encountered in converting our own *.sgml docs.

USAGE

  ./sgml2asciidoc.py < input.sgml > output.asciidoc

OPTIONS

  -d  Show some additional debugging output.
  -h  Show this help string.

COPYING

  Copyright (c) 2008 BBN Technologies Corp.

  This program is free software; you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation; either version 2 of the License, or
  (at your option) any later version.

  This program is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""

import sys
import traceback
import re
from sgmllib import SGMLParser
from textwrap import TextWrapper

class Tag:
    """One open SGML element, as kept on the parser's stack.

    name      -- element name handed over by the parser
    attrs     -- the element's attributes, copied into a dict
    data      -- text gathered for the element so far (starts empty)
    paracount -- count of paragraphs dumped with this element as
                 their parent (starts at 0)
    """

    def __init__(self, name, attrs):
        # accumulators first: nothing collected yet
        self.paracount = 0
        self.data = ""
        # then what identifies the element
        self.attrs = dict(attrs)
        self.name = name

# DocBook section elements; the digit that ends each name is the
# section level.
sect_tags = set(['sect%d' % level for level in range(1, 6)])

# Every element name the converter knows how to deal with; closing any
# other element produces an "unrecognized tag" warning.
recognized_tags = set([
    # document header
    'article', 'articleinfo', 'title', 'author', 'firstname', 'surname',
    'revhistory', 'revision', 'revnumber', 'date',
    # paragraphs and lists
    'para', 'itemizedlist', 'orderedlist', 'listitem',
    'simplelist', 'member',
    'bridgehead', 'formalpara',
    # inline markup and literal blocks
    'ulink', 'screen', 'emphasis', 'command', 'note',
    'filename', 'hostname', 'literal',
]) | sect_tags

# Title underline characters indexed by section level (index 0 is the
# document title).  AsciiDoc only supports section levels 1-4, so
# DocBook <sect5> reuses the <sect4> character; nobody's really going
# to notice the difference that deep in the hierarchy.
section_chars = tuple('=-~^++')

# Bullets for unordered lists, alternating with the list nesting depth.
bullet_chars = tuple('-*')

def print_tag_real(lineno, indent, s):
    """Write one debug trace line to stderr.

    The line is the zero-padded input line number, followed by `s`
    indented four spaces per `indent` level.
    """
    padding = ' ' * indent * 4
    trace = "%04d %s%s" % (lineno, padding, s)
    print >> sys.stderr, trace

def print_tag_null(lineno, indent, s):
    """Do nothing; same signature as print_tag_real, for when tracing is off."""
    return None

# Trace function used by the parser.  Tracing is off by default; the -d
# command-line option rebinds this name to print_tag_real.
print_tag = print_tag_null

class MyParser(SGMLParser):
   
    def __init__(self):
        SGMLParser.__init__(self)
        self.lineno    = 0
        self.stack     = []
        self.listlevel = 0
   
    def dump_para(self, para):
        if self.listlevel > 0:
            if para.name == 'member':
                # <member> is a list item that doesn't need <para>
                # tags around it, so treat it like its own parent
                # (which will never exceed paracount == 1)
                assert self.stack[-1].name == 'simplelist'
                parent = para
                grandparent = None
            else:
                # for everything else, walk up the stack looking for
                # innermost enclosing list, and make that the
                # grandparent
                parent = self.stack[-1]
                assert parent.name == 'listitem'
                for grandparent in reversed(self.stack):
                    if grandparent.name in ('itemizedlist', 'orderedlist'):
                        break
                assert grandparent.name in ('itemizedlist', 'orderedlist')
            if parent.paracount == 0:
                # first paragraph in a list item, so prefix the first
                # line of the list item itself with a with a bullet,
                # then indent subsequent lines
                initial_indent = " " * (self.listlevel * 2)
                subsequent_indent = " " * (self.listlevel * 2)
                if grandparent and grandparent.name == 'orderedlist':
                    bullet_char = '.'
                else:
                    bullet_char = bullet_chars[(self.listlevel - 1)
                                               % len(bullet_chars)]
                initial_indent = ' ' * (self.listlevel * 2 - 2) \
                    + bullet_char + ' '
                subsequent_indent = ' ' * (self.listlevel * 2)
            else:
                # in the rare case of more than one paragraph in a
                # single <listitem>, subsequent paragraphs are not
                # indented
                initial_indent = ""
                subsequent_indent = ""
        else:
            parent = self.stack[-1]
            assert parent.name in sect_tags or parent.name == 'note'
            initial_indent = ""
            subsequent_indent = ""
        data = para.data.strip()
        if data:
            if self.listlevel > 0:
                if parent.paracount > 0:
                    print '+'
                else:
                    print
            elif parent.paracount > 0:
                print
            wrapper = TextWrapper(width=70,
                                  initial_indent=initial_indent,
                                  subsequent_indent=subsequent_indent,
                                  break_long_words=False)
            print wrapper.fill(data)
        parent.paracount += 1
        para.data = ""  # reset it
   
    def unknown_starttag(self, name, attrs):

        # Handle some quirky special cases
        while self.stack:
           
            # Before beginning a list or <screen> inside a <para>, we
            # need to dump the para's text-so-far to stdout
            # immediately.  Otherwise, it'll get dumped *after* the
            # list or <screen>, which is wrong.
            if name in ('itemizedlist', 'orderedlist', 'simplelist', 'screen') \
                    and self.stack[-1].name == 'para':
                # go ahead and dump paragraph-in-progress
                print_tag(self.lineno, len(self.stack), "-- dump_para() --")
                # pop it off stack because dump_para() expects this
                para = self.stack.pop()
                self.dump_para(para)
                # push it pack
                self.stack.append(para)
                break
           
            # SGML sloppiness fix: maybe this start tag is opening a
            # new <listitem> and implicitly closes the previous one
            elif self.stack[-1].name == name:
                self.unknown_endtag(self.stack[-1].name, "@@@")
           
            # Another SGML sloppiness fix: don't let sections be
            # nested inside anything other than sections!
            elif name in sect_tags and self.stack[-1].name not in sect_tags \
                    and self.stack[-1].name != 'article':
                self.unknown_endtag(self.stack[-1].name, "###")
           
            else:
                break
           
        # debug output
        print_tag(self.lineno, len(self.stack), name)

        # main processing switch for start tags
        if name in ('itemizedlist', 'orderedlist', 'simplelist'):
            self.listlevel += 1
        elif name == 'note':
            print '=' * 70

        tag = Tag(name, attrs)
        self.stack.append(tag)

    def handle_data(self, data):
        # just append between-tag text to the topmost tag on the stack
        if self.stack:
            self.stack[-1].data += data

    def unknown_endtag(self, name, mark=""):

        # Another SGML sloppiness fix: we may encounter, e.g., a
        # </itemizedlist> that implicitly closes the still-open
        # <listitem>
        while self.stack[-1].name != name:
            self.unknown_endtag(self.stack[-1].name, "!!!")

        tag = self.stack.pop()
        print_tag(self.lineno, len(self.stack), "/%s %s" % (name, mark))

        # if it's a <literal> tag, then treat it as either <screen>
        # or <command> depending on whether it contains newlines
        if name == 'literal':
            if '\n' in tag.data:
                name = 'screen'
            else:
                name = 'command'

        if name == 'title':
            parent = self.stack[-1]
            assert parent.name in ('sect1', 'sect2', 'sect3', 'sect4', 'sect5',
                                   'articleinfo', 'formalpara')
            print  # blank line before each title
            data = tag.data.strip().replace('\n', ' ')
            if parent.name == 'formalpara':
                print ".%s" % data
            else:
                if parent.name == 'articleinfo':
                    seclevel = 0
                else:
                    seclevel = int(parent.name[4])
                    if seclevel > 4:
                        print '/' * 70
                        print "Was <sect%d> in SGML," % seclevel,
                        print "but AsciiDoc only has section levels 1-4."
                        print '/' * 70
                    if parent.attrs.has_key('id'):
                        print "[[%s]]" % parent.attrs['id']
                print data
                print section_chars[seclevel] * len(data)
       
        elif name == 'firstname':
            assert self.stack[-1].name == 'author'
            assert self.stack[-2].name == 'articleinfo'
            print tag.data.strip(),  # space, no newline
       
        elif name == 'surname':
            assert self.stack[-1].name == 'author'
            assert self.stack[-2].name == 'articleinfo'
            print tag.data.strip().replace(' ', '_')
       
        elif name == 'date':
            assert self.stack[-1].name == 'revision'
            assert self.stack[-2].name == 'revhistory'
            print tag.data.strip()
       
        elif name in ('itemizedlist', 'orderedlist', 'simplelist'):
            self.listlevel -= 1

        elif name in ('para', 'member'):
            self.dump_para(tag)

        elif name == 'listitem':
            parent = self.stack[-1]
            assert parent.name in ('itemizedlist', 'orderedlist', 'para')
       
        elif name == 'ulink':
            parent = self.stack[-1]
            assert parent.name == 'para'
            if ':' in tag.attrs['url']:  # absolute URL
                parent.data += "%s[%s]" % (tag.attrs['url'], tag.data.strip())
            else:  # relative URL
                parent.data += "link:%s[%s]" % (tag.attrs['url'],
                                              tag.data.strip())
       
        elif name == 'screen':
            if self.listlevel > 0:
                parent = self.stack[-1]
                if parent.name == 'para':
                    parent = self.stack[-2]
                assert parent.name == 'listitem'
                if parent.paracount > 0:
                    print '+'
            else:
                print
            print '-' * 70
            # seem to have an extraneous newline at each end, so this
            # is an ad-hoc solution to get rid of them
            print tag.data.strip('\n')
            print '-' * 70
       
        elif name == 'emphasis':
            parent = self.stack[-1]
            assert parent.name == 'para'
            parent.data += "__%s__" % tag.data.strip()
       
        elif name in ('command', 'filename', 'hostname'):
            parent = self.stack[-1]
            assert parent.name in ('para', 'screen')
            if parent.name == 'screen':
                parent.data += tag.data.strip()
            else:
                parent.data += "++%s++" % tag.data.strip()
       
        elif name == 'note':
            print '=' * 70
       
        elif name == 'bridgehead':
            print >> sys.stderr, \
                "WARNING: <bridgehead> being interpreted as <sect4>; make " + \
                "         sure this is semantically faithful"
            # make it look like a sect4
            data = tag.data.strip()
            print tag.data
            print '+' * len(tag.data)
       
        elif name in recognized_tags:
            if tag.data.strip():
                print >> sys.stderr, \
                    "WARNING: unexpectedly non-empty tag '%s'" % name
       
        else:
            print >> sys.stderr, \
                "WARNING: unrecognized tag '%s'" % name

    def unknown_charref(self, ref):
        print >> sys.stderr, \
            "WARNING: unknown character reference '%s'" % ref

    def unknown_entityref(self, ref):
        print >> sys.stderr, \
            "WARNING: unknown entity reference '%s'" % ref

    def report_unbalanced(self, name):
        print >> sys.stderr, \
            "ERROR: unbalanced tag '%s'" % name
        sys.exit(1)

    def parse(self, stream):
        for line in stream:
            # somebody seems to like putting in paragraphs with a
            # single line of the form:
            # [to be filled in by so-and-so]
            # These confuse AsciiDoc since it's the same syntax used
            # for paragraph styles, so nop-quote them
            line = re.sub(r'^[[]([^]]*)[]]$', r'#[\1]#', line)
            self.lineno += 1
            try:
                self.feed(line)
            except Exception:
                exctype, excvalue, tb = sys.exc_info()
                traceback.print_exception(exctype, excvalue, tb)
                print >> sys.stderr, "exception at input line %d" % self.lineno
                sys.exit(1)
        self.close()

if __name__ == '__main__':
    if len(sys.argv) > 1:
        if sys.argv[1] == "-d":
            print_tag = print_tag_real
        elif sys.argv[1] in ("-h", "-help", "--help"):
            print __doc__
            sys.exit(0)
    myparser = MyParser()
    myparser.parse(sys.stdin)

_______________________________________________
asciidoc-discuss mailing list
asciidoc-discuss@...
http://lists.metaperl.com/cgi-bin/mailman/listinfo/asciidoc-discuss