"""ASCII, Dammit

Stupid library to turn MS chars (like smart quotes) and ISO-Latin
chars into ASCII, dammit. Will do plain text approximations, or more
accurate HTML representations. Can also be jiggered to just fix the
smart quotes and leave the rest of ISO-Latin alone.

Sources:
 http://www.cs.tut.fi/~jkorpela/latin1/all.html
 http://www.webreference.com/html/reference/character/isolat1.html

1.0 Initial Release (2004-11-28)
"""

__author__ = "Leonard Richardson (leonardr@segfault.org)"
__version__ = "$Revision: 1.2 $"
__date__ = "$Date: 2004/11/29 02:18:51 $"
__copyright__ = "Copyright (c) 2004 Leonard Richardson"
__license__ = "Public domain"

import re
import string
import types

# Replacement table for the characters \x80-\xff.
#
# Each value is either:
#   * a tuple (ascii_approximation, html_entity_name) -- the entity name is
#     stored WITHOUT the surrounding '&' and ';', which _repl() adds; or
#   * a plain string, used verbatim for both ASCII and HTML output (there
#     is no entity name recorded for that character).
#
# Entity names are case-sensitive, so they must be spelled exactly as in
# the HTML entity tables (e.g. 'AElig', 'ETH', 'THORN').
CHARS = { '\x80' : ('EUR', 'euro'),
          '\x81' : ' ',
          '\x82' : (',', 'sbquo'),
          '\x83' : ('f', 'fnof'),
          '\x84' : (',,', 'bdquo'),
          '\x85' : ('...', 'hellip'),
          '\x86' : ('+', 'dagger'),
          '\x87' : ('++', 'Dagger'),
          '\x88' : ('^', 'circ'),    # modifier circumflex is &circ;
          '\x89' : '%',
          '\x8A' : ('S', 'Scaron'),
          '\x8B' : ('<', 'lt'),      # no trailing ';' -- _repl() appends it
          '\x8C' : ('OE', 'OElig'),
          '\x8D' : '?',
          '\x8E' : 'Z',
          '\x8F' : '?',
          '\x90' : '?',
          '\x91' : ("'", 'lsquo'),
          '\x92' : ("'", 'rsquo'),
          '\x93' : ('"', 'ldquo'),
          '\x94' : ('"', 'rdquo'),
          '\x95' : ('*', 'bull'),
          '\x96' : ('-', 'ndash'),
          '\x97' : ('--', 'mdash'),
          '\x98' : ('~', 'tilde'),
          '\x99' : ('(TM)', 'trade'),
          '\x9a' : ('s', 'scaron'),
          '\x9b' : ('>', 'gt'),
          '\x9c' : ('oe', 'oelig'),
          '\x9d' : '?',
          '\x9e' : 'z',
          '\x9f' : ('Y', 'Yuml'),
          '\xa0' : (' ', 'nbsp'),
          '\xa1' : ('!', 'iexcl'),
          '\xa2' : ('c', 'cent'),
          '\xa3' : ('GBP', 'pound'),
          '\xa4' : ('$', 'curren'), #This approximation is especially lame.
          '\xa5' : ('YEN', 'yen'),
          '\xa6' : ('|', 'brvbar'),
          '\xa7' : ('S', 'sect'),
          '\xa8' : ('..', 'uml'),
          '\xa9' : ('', 'copy'),
          '\xaa' : ('(th)', 'ordf'),
          '\xab' : ('<<', 'laquo'),
          '\xac' : ('!', 'not'),
          '\xad' : (' ', 'shy'),
          '\xae' : ('(R)', 'reg'),
          '\xaf' : ('-', 'macr'),
          '\xb0' : ('o', 'deg'),
          '\xb1' : ('+-', 'plusmn'),
          '\xb2' : ('2', 'sup2'),
          '\xb3' : ('3', 'sup3'),
          '\xb4' : ("'", 'acute'),
          '\xb5' : ('u', 'micro'),
          '\xb6' : ('P', 'para'),
          '\xb7' : ('*', 'middot'),
          '\xb8' : (',', 'cedil'),
          '\xb9' : ('1', 'sup1'),
          '\xba' : ('(th)', 'ordm'),
          '\xbb' : ('>>', 'raquo'),
          '\xbc' : ('1/4', 'frac14'),
          '\xbd' : ('1/2', 'frac12'),
          '\xbe' : ('3/4', 'frac34'),
          '\xbf' : ('?', 'iquest'),
          '\xc0' : ('A', "Agrave"),
          '\xc1' : ('A', "Aacute"),
          '\xc2' : ('A', "Acirc"),
          '\xc3' : ('A', "Atilde"),
          '\xc4' : ('A', "Auml"),
          '\xc5' : ('A', "Aring"),
          '\xc6' : ('AE', "AElig"),
          '\xc7' : ('C', "Ccedil"),
          '\xc8' : ('E', "Egrave"),
          '\xc9' : ('E', "Eacute"),
          '\xca' : ('E', "Ecirc"),
          '\xcb' : ('E', "Euml"),
          '\xcc' : ('I', "Igrave"),
          '\xcd' : ('I', "Iacute"),
          '\xce' : ('I', "Icirc"),
          '\xcf' : ('I', "Iuml"),
          '\xd0' : ('D', "ETH"),
          '\xd1' : ('N', "Ntilde"),
          '\xd2' : ('O', "Ograve"),
          '\xd3' : ('O', "Oacute"),
          '\xd4' : ('O', "Ocirc"),
          '\xd5' : ('O', "Otilde"),
          '\xd6' : ('O', "Ouml"),
          '\xd7' : ('*', "times"),
          '\xd8' : ('O', "Oslash"),
          '\xd9' : ('U', "Ugrave"),
          '\xda' : ('U', "Uacute"),
          '\xdb' : ('U', "Ucirc"),
          '\xdc' : ('U', "Uuml"),
          '\xdd' : ('Y', "Yacute"),
          '\xde' : ('b', "THORN"),
          '\xdf' : ('B', "szlig"),
          '\xe0' : ('a', "agrave"),
          '\xe1' : ('a', "aacute"),
          '\xe2' : ('a', "acirc"),
          '\xe3' : ('a', "atilde"),
          '\xe4' : ('a', "auml"),
          '\xe5' : ('a', "aring"),
          '\xe6' : ('ae', "aelig"),
          '\xe7' : ('c', "ccedil"),
          '\xe8' : ('e', "egrave"),
          '\xe9' : ('e', "eacute"),
          '\xea' : ('e', "ecirc"),
          '\xeb' : ('e', "euml"),
          '\xec' : ('i', "igrave"),
          '\xed' : ('i', "iacute"),
          '\xee' : ('i', "icirc"),
          '\xef' : ('i', "iuml"),
          '\xf0' : ('o', "eth"),
          '\xf1' : ('n', "ntilde"),
          '\xf2' : ('o', "ograve"),
          '\xf3' : ('o', "oacute"),
          '\xf4' : ('o', "ocirc"),
          '\xf5' : ('o', "otilde"),
          '\xf6' : ('o', "ouml"),
          '\xf7' : ('/', "divide"),
          '\xf8' : ('o', "oslash"),
          '\xf9' : ('u', "ugrave"),
          '\xfa' : ('u', "uacute"),
          '\xfb' : ('u', "ucirc"),
          '\xfc' : ('u', "uuml"),
          '\xfd' : ('y', "yacute"),
          '\xfe' : ('b', "thorn"),
          '\xff' : ('y', "yuml"),
          }

def _makeRE(limit):
    """Build the compiled pattern for the special-character range.

    limit -- two hex digits, as a string, naming the last character of the
             range; the range always starts at \\x80.

    The returned pattern matches (and captures) one character at a time.
    """
    # The literal \x80 is embedded directly; the upper bound is left as a
    # regex-level \xNN escape so it can be filled in from `limit`.
    character_class = "([\x80-\\x%s])" % limit
    return re.compile(character_class, re.M)

# Every character handled by this module: \x80 through \xff.
ALL = _makeRE('ff')
# Only the Windows-specific range: \x80 through \x9f.
ONLY_WINDOWS = _makeRE('9f')

def _replHTML(match):
    """Substitution callback: delegate to _repl() with HTML output enabled.

    Exists so that a one-argument callable can be handed to the regex
    substitution when HTML entities are wanted.
    """
    return _repl(match, html=1)
          
def _repl(match, html=0):
    """Replace the matched character with its HTML or ASCII equivalent.

    match -- regex match object; the full matched text (group 0) is the
             key looked up in CHARS.
    html  -- if true, return the HTML entity ('&name;'); otherwise return
             the plain-ASCII approximation.

    Returns the replacement string. A character with no CHARS entry is
    returned unchanged. A CHARS entry that is a plain string (no entity
    name recorded) is returned as-is in both modes.
    """
    char = match.group(0)
    replacement = CHARS.get(char, char)
    # isinstance() rather than comparing against types.TupleType: the
    # latter only exists on Python 2 and rejects tuple subclasses.
    if isinstance(replacement, tuple):
        if html:
            # Index 1 holds the bare entity name; wrap it here.
            return '&' + replacement[1] + ';'
        return replacement[0]
    return replacement

def _dammit(t, html=0, fixWindowsOnly=0):
    """Shared implementation behind asciiDammit() and htmlDammit().

    t              -- the text to convert.
    html           -- if true, substitute HTML entities (via _replHTML);
                      otherwise substitute ASCII approximations (via _repl).
    fixWindowsOnly -- if true, only characters matched by ONLY_WINDOWS are
                      replaced; otherwise everything matched by ALL is.

    Returns the converted text.
    """
    if fixWindowsOnly:
        pattern = ONLY_WINDOWS
    else:
        pattern = ALL

    if html:
        replacer = _replHTML
    else:
        replacer = _repl

    return pattern.sub(replacer, t)

def asciiDammit(t, fixWindowsOnly=0):
    """Return t with special characters replaced by plain-ASCII stand-ins.

    If fixWindowsOnly is true, only the \\x80-\\x9f range is replaced and
    the remaining ISO-Latin-1 characters are left untouched.
    """
    return _dammit(t, html=0, fixWindowsOnly=fixWindowsOnly)

def htmlDammit(t, fixWindowsOnly=0):
    """Return t with special characters replaced by HTML entities.

    If fixWindowsOnly is true, only the \\x80-\\x9f range is replaced and
    the remaining ISO-Latin-1 characters are left untouched.
    """
    return _dammit(t, 1, fixWindowsOnly)

def demoronise(t):
    """Replace only the Windows-specific characters in t with ASCII.

    Convenience wrapper around asciiDammit() with fixWindowsOnly switched
    on. Named in honor of the original smart quotes remover,
    The Demoroniser:

    http://www.fourmilab.ch/webtools/demoroniser/"""
    return asciiDammit(t, fixWindowsOnly=1)

if __name__ == '__main__':
    # Small demonstration: run both conversions over a sample containing
    # Windows smart quotes (\x93) and an ISO-Latin-1 letter (\xe9).
    # Each print is given exactly one parenthesized argument so the output
    # is the same as with the bare print statement.

    sample = '\x93Sacr\xe9 bleu!\x93'
    print("First we mangle some French.")
    print(asciiDammit(sample))
    print(htmlDammit(sample))

    print("")
    print("And now we fix the MS-quotes but leave the French alone.")
    print(demoronise(sample))
    print(htmlDammit(sample, 1))