commit: e9467524741b67ef6ceaabf299931ae5bc32e9f6
parent: 432ec664a38e7357e432df09924a55d6426a8a55
Author: asciimoo <asciimoo@gmail.com>
Date: Fri, 8 Nov 2013 23:44:26 +0100
[enh] utils.py added
Diffstat:
1 file changed, 26 insertions(+), 0 deletions(-)
diff --git a/searx/utils.py b/searx/utils.py
@@ -0,0 +1,26 @@
+from HTMLParser import HTMLParser
+import htmlentitydefs
+
class HTMLTextExtractor(HTMLParser):
    """HTML parser that collects the textual content of a document.

    Text nodes and resolved character/entity references are accumulated in
    ``self.result`` in the order they are reported by the parser; tags are
    ignored because no tag handlers are overridden.  Call ``get_text()``
    after feeding the markup to obtain the concatenated text.
    """

    def __init__(self):
        HTMLParser.__init__(self)
        # Text fragments in document order; joined by get_text().
        self.result = []

    def handle_data(self, d):
        """Record a run of plain text."""
        self.result.append(d)

    def handle_charref(self, number):
        """Record the character for a numeric reference (``&#65;``, ``&#x41;``).

        ``number`` is the reference without the leading ``&#`` and the
        trailing ``;``.  A reference that cannot be turned into a character
        (out-of-range code point, malformed digits) is kept as literal text
        instead of raising, so one bad reference cannot abort extraction.
        """
        if number[0] in (u'x', u'X'):
            digits, base = number[1:], 16
        else:
            digits, base = number, 10
        try:
            char = unichr(int(digits, base))
        except (ValueError, OverflowError):
            # Not representable as a character: keep the raw reference.
            char = u'&#%s;' % number
        self.result.append(char)

    def handle_entityref(self, name):
        """Record the character for a named reference (``&amp;``).

        ``name`` is the entity name without ``&`` and ``;``.  Names missing
        from ``htmlentitydefs.name2codepoint`` are kept as literal
        ``&name;`` text instead of raising ``KeyError``.
        """
        codepoint = htmlentitydefs.name2codepoint.get(name)
        if codepoint is None:
            self.result.append(u'&%s;' % name)
        else:
            self.result.append(unichr(codepoint))

    def get_text(self):
        """Return all text collected so far as a single unicode string."""
        return u''.join(self.result)
+
def html_to_text(html):
    """Return the text collected from *html* by an HTMLTextExtractor.

    A fresh extractor is created for every call, the whole of *html* is fed
    to it in one go, and the text it accumulated is returned.
    """
    extractor = HTMLTextExtractor()
    extractor.feed(html)
    return extractor.get_text()