From 92c6e5c7c7ee8b7e00088ae5dfb5f9a76572d19f Mon Sep 17 00:00:00 2001 From: rachmadaniHaryono Date: Wed, 1 Aug 2018 19:59:33 +0800 Subject: [PATCH] chg: dev: use bs4 as backup parser --- buku.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/buku.py b/buku.py index 05bf0d8..1e9375b 100755 --- a/buku.py +++ b/buku.py @@ -2953,20 +2953,37 @@ def get_page_title(resp): parser = BukuHTMLParser() charset = 'utf-8' + soup = None + parsed_title = None try: - charset = 'utf-8' + from bs4 import BeautifulSoup + soup = BeautifulSoup(resp.data, 'html.parser') + except Exception as e: + logerr('get_page_title(): %s', e) + try: + charset_found = False if 'content-type' in resp.headers: _, params = cgi.parse_header(resp.headers['content-type']) if params.get('charset') is not None: charset = params.get('charset') + charset_found = True + if not charset_found and soup: + meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'}) + if meta_tag: + _, params = cgi.parse_header(meta_tag.attrs['content']) + charset = params.get('charset', charset) parser.feed(resp.data.decode(charset)) except Exception as e: + if isinstance(e, UnicodeDecodeError) and soup: + parsed_title = soup.find('title').text # Suppress Exception due to intentional self.reset() in BHTMLParser if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'): logerr('get_page_title(): %s', e) finally: - return re.sub('\s{2,}', ' ', parser.parsed_title) + if not parsed_title: + parsed_title = parser.parsed_title + return re.sub('\s{2,}', ' ', parsed_title) def gen_headers():