from javax.servlet.http import HttpServlet
import httplib
import urllib
import re
import sre
import sre_compile
import string
import types
import sys
# Replacement table for the characters \x80 through \xff.
#
# A tuple value is (ASCII approximation, HTML entity name); the entity name is
# stored WITHOUT the leading '&' and trailing ';' because _repl adds them.
# A plain string value is an ASCII approximation only and is used unchanged in
# both ASCII and HTML mode.
#
# Entity names are case-sensitive and must be valid HTML names; an entry such
# as 'lt;' would be emitted as '&lt;;'.
CHARS = {
    '\x80' : ('EUR', 'euro'),
    '\x81' : ' ',
    '\x82' : (',', 'sbquo'),
    '\x83' : ('f', 'fnof'),
    '\x84' : (',,', 'bdquo'),
    '\x85' : ('...', 'hellip'),
    '\x86' : ('+', 'dagger'),
    '\x87' : ('++', 'Dagger'),
    '\x88' : ('^', 'circ'),
    '\x89' : '%',
    '\x8A' : ('S', 'Scaron'),
    '\x8B' : ('<', 'lt'),
    '\x8C' : ('OE', 'OElig'),
    '\x8D' : '?',
    '\x8E' : 'Z',
    '\x8F' : '?',
    '\x90' : '?',
    '\x91' : ("'", 'lsquo'),
    '\x92' : ("'", 'rsquo'),
    '\x93' : ('"', 'ldquo'),
    '\x94' : ('"', 'rdquo'),
    '\x95' : ('*', 'bull'),
    '\x96' : ('-', 'ndash'),
    '\x97' : ('--', 'mdash'),
    '\x98' : ('~', 'tilde'),
    '\x99' : ('(TM)', 'trade'),
    '\x9a' : ('s', 'scaron'),
    '\x9b' : ('>', 'gt'),
    '\x9c' : ('oe', 'oelig'),
    '\x9d' : '?',
    '\x9e' : 'z',
    '\x9f' : ('Y', 'Yuml'),
    '\xa0' : (' ', 'nbsp'),
    '\xa1' : ('!', 'iexcl'),
    '\xa2' : ('c', 'cent'),
    '\xa3' : ('GBP', 'pound'),
    '\xa4' : ('$', 'curren'),
    '\xa5' : ('YEN', 'yen'),
    '\xa6' : ('|', 'brvbar'),
    '\xa7' : ('S', 'sect'),
    '\xa8' : ('..', 'uml'),
    '\xa9' : ('', 'copy'),
    '\xaa' : ('(th)', 'ordf'),
    '\xab' : ('<<', 'laquo'),
    '\xac' : ('!', 'not'),
    '\xad' : (' ', 'shy'),
    '\xae' : ('(R)', 'reg'),
    '\xaf' : ('-', 'macr'),
    '\xb0' : ('o', 'deg'),
    '\xb1' : ('+-', 'plusmn'),
    '\xb2' : ('2', 'sup2'),
    '\xb3' : ('3', 'sup3'),
    '\xb4' : ("'", 'acute'),
    '\xb5' : ('u', 'micro'),
    '\xb6' : ('P', 'para'),
    '\xb7' : ('*', 'middot'),
    '\xb8' : (',', 'cedil'),
    '\xb9' : ('1', 'sup1'),
    '\xba' : ('(th)', 'ordm'),
    '\xbb' : ('>>', 'raquo'),
    '\xbc' : ('1/4', 'frac14'),
    '\xbd' : ('1/2', 'frac12'),
    '\xbe' : ('3/4', 'frac34'),
    '\xbf' : ('?', 'iquest'),
    '\xc0' : ('A', "Agrave"),
    '\xc1' : ('A', "Aacute"),
    '\xc2' : ('A', "Acirc"),
    '\xc3' : ('A', "Atilde"),
    '\xc4' : ('A', "Auml"),
    '\xc5' : ('A', "Aring"),
    '\xc6' : ('AE', "AElig"),
    '\xc7' : ('C', "Ccedil"),
    '\xc8' : ('E', "Egrave"),
    '\xc9' : ('E', "Eacute"),
    '\xca' : ('E', "Ecirc"),
    '\xcb' : ('E', "Euml"),
    '\xcc' : ('I', "Igrave"),
    '\xcd' : ('I', "Iacute"),
    '\xce' : ('I', "Icirc"),
    '\xcf' : ('I', "Iuml"),
    '\xd0' : ('D', "ETH"),
    '\xd1' : ('N', "Ntilde"),
    '\xd2' : ('O', "Ograve"),
    '\xd3' : ('O', "Oacute"),
    '\xd4' : ('O', "Ocirc"),
    '\xd5' : ('O', "Otilde"),
    '\xd6' : ('O', "Ouml"),
    '\xd7' : ('*', "times"),
    '\xd8' : ('O', "Oslash"),
    '\xd9' : ('U', "Ugrave"),
    '\xda' : ('U', "Uacute"),
    '\xdb' : ('U', "Ucirc"),
    '\xdc' : ('U', "Uuml"),
    '\xdd' : ('Y', "Yacute"),
    '\xde' : ('b', "THORN"),
    '\xdf' : ('B', "szlig"),
    '\xe0' : ('a', "agrave"),
    '\xe1' : ('a', "aacute"),
    '\xe2' : ('a', "acirc"),
    '\xe3' : ('a', "atilde"),
    '\xe4' : ('a', "auml"),
    '\xe5' : ('a', "aring"),
    '\xe6' : ('ae', "aelig"),
    '\xe7' : ('c', "ccedil"),
    '\xe8' : ('e', "egrave"),
    '\xe9' : ('e', "eacute"),
    '\xea' : ('e', "ecirc"),
    '\xeb' : ('e', "euml"),
    '\xec' : ('i', "igrave"),
    '\xed' : ('i', "iacute"),
    '\xee' : ('i', "icirc"),
    '\xef' : ('i', "iuml"),
    '\xf0' : ('o', "eth"),
    '\xf1' : ('n', "ntilde"),
    '\xf2' : ('o', "ograve"),
    '\xf3' : ('o', "oacute"),
    '\xf4' : ('o', "ocirc"),
    '\xf5' : ('o', "otilde"),
    '\xf6' : ('o', "ouml"),
    '\xf7' : ('/', "divide"),
    '\xf8' : ('o', "oslash"),
    '\xf9' : ('u', "ugrave"),
    '\xfa' : ('u', "uacute"),
    '\xfb' : ('u', "ucirc"),
    '\xfc' : ('u', "uuml"),
    '\xfd' : ('y', "yacute"),
    '\xfe' : ('b', "thorn"),
    '\xff' : ('y', "yuml"),
}
def _makeRE(limit):
    """Compile a pattern matching one character in the range \\x80..\\x<limit>.

    limit -- two hex digits, as a string, naming the upper end of the
             character class (for example 'ff' or '9f').

    The matched character is captured as group 1.
    """
    char_class = "([\x80-\\x%s])" % limit
    return re.compile(char_class, re.M)
# Precompiled matchers: ALL covers every character from \x80 through \xff,
# ONLY_WINDOWS stops at \x9f.
ALL, ONLY_WINDOWS = _makeRE('ff'), _makeRE('9f')
def _replHTML(match):
    """Substitution callback that yields the HTML form of the matched character.

    Delegates to _repl with the html flag switched on.
    """
    return _repl(match, html=1)
def _repl(match, html=0):
    """Replace the matched character with its HTML or ASCII equivalent.

    match -- regex match object; its whole match (group 0) is looked up
             in CHARS.
    html  -- 0 to pick the ASCII approximation, 1 to pick the HTML entity
             name, which is then wrapped as '&name;'.

    Returns the replacement string.  A character with no CHARS entry is
    returned unchanged, and a CHARS entry that is a plain string (no entity
    name available) is returned as-is in both modes.
    """
    g = match.group(0)
    a = CHARS.get(g, g)
    # isinstance() instead of an exact type() comparison; type(()) names the
    # built-in tuple type without depending on the 'types' module.
    if isinstance(a, type(())):
        a = a[html]
        if html:
            a = '&' + a + ';'
    return a
def _dammit(t, html=0, fixWindowsOnly=0):
    """Turn ISO-Latin-1 text into an ASCII representation, dammit.

    t              -- the text to clean up.
    html           -- when true, substitute HTML entities (via _replHTML)
                      instead of ASCII approximations (via _repl).
    fixWindowsOnly -- when true, only touch characters matched by
                      ONLY_WINDOWS (\\x80-\\x9f) instead of ALL (\\x80-\\xff).

    Returns the text with every matched character substituted.
    """
    if fixWindowsOnly:
        pattern = ONLY_WINDOWS
    else:
        pattern = ALL

    if html:
        substitute = _replHTML
    else:
        substitute = _repl

    return pattern.sub(substitute, t)
def asciiDammit(t, fixWindowsOnly=0):
    """Return t with high-bit ISO-Latin-1 characters replaced by plain ASCII
    approximations, dammit.

    fixWindowsOnly -- passed through to _dammit; when true only the
                      \\x80-\\x9f range is rewritten.
    """
    return _dammit(t, html=0, fixWindowsOnly=fixWindowsOnly)
def htmlDammit(t, fixWindowsOnly=0):
    """Return t as plain ASCII with high-bit ISO-Latin-1 characters replaced by
    HTML codes, dammit.

    fixWindowsOnly -- passed through to _dammit; when true only the
                      \\x80-\\x9f range is rewritten.
    """
    return _dammit(t, 1, fixWindowsOnly)
def demoronise(t):
    """Apply asciiDammit to t with fixWindowsOnly switched on.

    Helper named in honor of the original smart quotes remover,
    The Demoroniser:
    http://www.fourmilab.ch/webtools/demoroniser/
    """
    return asciiDammit(t, fixWindowsOnly=1)
class curlyQuoteParser (HttpServlet):
    """Servlet that fetches the document named by the ``url`` request
    parameter, runs it through asciiDammit plus a few fix-up replacements,
    and writes the result back as text/xml followed by attribution comments.
    """

    def doGet(self, request, response):
        "Handle GET requests exactly like POST requests."
        self.doPost(request, response)

    def doPost(self, request, response):
        """Fetch the ``url`` parameter's target and write the cleaned text.

        request  -- servlet request; must carry a ``url`` parameter holding
                    an http or https URL.
        response -- servlet response; receives either the cleaned document
                    or a 400 error when ``url`` is missing or not http(s).

        Errors raised by urllib while fetching are not caught here.
        """
        this_url = request.getParameter("url")
        if this_url is not None:
            this_url = string.strip(this_url)
        if not this_url:
            # 400 = Bad Request
            response.sendError(400, "Missing required parameter: url")
            return

        # The URL comes straight from the client.  urllib.urlopen also opens
        # local files (file: URLs and scheme-less paths), so only plain web
        # schemes are allowed through.
        # NOTE(review): this does not stop requests to internal hosts; add a
        # host allow-list if this servlet is reachable from untrusted networks.
        scheme = string.lower(string.split(this_url, ":", 1)[0])
        if scheme not in ("http", "https"):
            response.sendError(400, "Only http and https URLs are supported")
            return

        response.setContentType("text/xml")
        writer = response.getWriter()

        reply = urllib.urlopen(this_url)
        try:
            text = reply.read()
        finally:
            # Always release the upstream connection, even if read() fails.
            reply.close()

        final_text = asciiDammit(text)
        # The first four patterns are what asciiDammit leaves behind when the
        # source was UTF-8 encoded curly quotes / dashes (three bytes each).
        fixups = (
            ("aEUR(TM)", "'"),
            ("aEURoe", "'"),
            ("aEUR\"", "-"),
            ("aEUR?", "'"),
            ("<br>", "<br />"),
            # A bare ampersand is not well-formed in XML output.
            (" & ", " &amp; "),
        )
        for old, new in fixups:
            final_text = string.replace(final_text, old, new)
        writer.println(final_text)

        # "--" may not appear inside an XML comment and the comment body may
        # not end with "-", so percent-encode the offending hyphens; this
        # keeps a crafted URL from closing the comment early.
        comment_url = string.replace(this_url, "--", "-%2D")
        if comment_url[-1:] == "-":
            comment_url = comment_url[:-1] + "%2D"
        writer.println("<!--" + comment_url + "-->")
        writer.println("<!-- Created with http://webservices.freshwaterlife.org/Quotes/curlyQuoteParser.py the MS Curly Quote parsing thingy from FreshwaterLife -->")
        writer.println("<!-- The Jython Servlet version of AsciiDammit.py -->")
        writer.println("<!-- phollands@freshwaterlife.org | http://www.freshwaterlife.org | Copyright 2006 -->")