diff --git a/buku.py b/buku.py index 05bf0d8..1e9375b 100755 --- a/buku.py +++ b/buku.py @@ -2953,20 +2953,37 @@ def get_page_title(resp): parser = BukuHTMLParser() charset = 'utf-8' + soup = None + parsed_title = None try: - charset = 'utf-8' + from bs4 import BeautifulSoup + soup = BeautifulSoup(resp.data, 'html.parser') + except Exception as e: + logerr('get_page_title(): %s', e) + try: + charset_found = False if 'content-type' in resp.headers: _, params = cgi.parse_header(resp.headers['content-type']) if params.get('charset') is not None: charset = params.get('charset') + charset_found = True + if not charset_found and soup: + meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'}) + if meta_tag: + _, params = cgi.parse_header(meta_tag.attrs['content']) + charset = params.get('charset', charset) parser.feed(resp.data.decode(charset)) except Exception as e: + if isinstance(e, UnicodeDecodeError) and soup: + parsed_title = soup.find('title').text # Suppress Exception due to intentional self.reset() in BHTMLParser if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'): logerr('get_page_title(): %s', e) finally: - return re.sub('\s{2,}', ' ', parser.parsed_title) + if not parsed_title: + parsed_title = parser.parsed_title + return re.sub('\s{2,}', ' ', parsed_title) def gen_headers():