[fix] html escape

This commit is contained in:
asciimoo 2013-11-18 16:47:20 +01:00
parent ad7c83e3f6
commit d0427d9bae

View file

@@ -1,5 +1,5 @@
 from HTMLParser import HTMLParser
-import htmlentitydefs
+#import htmlentitydefs
 import csv
 import codecs
 import cStringIO
@@ -17,8 +17,9 @@ class HTMLTextExtractor(HTMLParser):
         self.result.append(unichr(codepoint))
 
     def handle_entityref(self, name):
-        codepoint = htmlentitydefs.name2codepoint[name]
-        self.result.append(unichr(codepoint))
+        #codepoint = htmlentitydefs.name2codepoint[name]
+        #self.result.append(unichr(codepoint))
+        self.result.append(name)
 
     def get_text(self):
         return u''.join(self.result)