From 69281444fd74bc135c9fc763c49e5475e7c11e4b Mon Sep 17 00:00:00 2001 From: Arun Prakash Jana Date: Wed, 1 Aug 2018 05:53:28 +0530 Subject: [PATCH] Detect page charset from content-type field --- buku.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/buku.py b/buku.py index 5cbefb5..f012154 100755 --- a/buku.py +++ b/buku.py @@ -18,6 +18,7 @@ # along with Buku. If not, see <http://www.gnu.org/licenses/>. import argparse +import cgi import collections import html.parser as HTMLParser import json @@ -2948,9 +2949,14 @@ def get_page_title(resp): """ parser = BukuHTMLParser() + charset = 'utf-8' try: - parser.feed(resp.data.decode('utf-8')) + if 'content-type' in resp.headers: + _, params = cgi.parse_header(resp.headers['content-type']) + if 'charset' in params: + charset = params['charset'] + parser.feed(resp.data.decode(charset)) except Exception as e: # Suppress Exception due to intentional self.reset() in BHTMLParser if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):