Detect page charset from content-type field

This commit is contained in:
Arun Prakash Jana 2018-08-01 05:53:28 +05:30
parent 88c4ccbccb
commit 69281444fd

View File

@@ -18,6 +18,7 @@
# along with Buku. If not, see <http://www.gnu.org/licenses/>.
import argparse
import cgi
import collections
import html.parser as HTMLParser
import json
@@ -2948,9 +2949,14 @@ def get_page_title(resp):
"""
parser = BukuHTMLParser()
charset = 'utf-8'
try:
parser.feed(resp.data.decode('utf-8'))
if 'content-type' in resp.headers:
_, params = cgi.parse_header(resp.headers['content-type'])
if 'charset' in params:
charset = params['charset']
parser.feed(resp.data.decode(charset))
except Exception as e:
# Suppress Exception due to intentional self.reset() in BHTMLParser
if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):