chg: dev: use bs4 as backup parser

This commit is contained in:
rachmadaniHaryono 2018-08-01 19:59:33 +08:00
parent 1a0b5bd328
commit 92c6e5c7c7

21
buku.py
View File

@ -2953,20 +2953,37 @@ def get_page_title(resp):
parser = BukuHTMLParser()
charset = 'utf-8'
soup = None
parsed_title = None
try:
charset = 'utf-8'
from bs4 import BeautifulSoup
soup = BeautifulSoup(resp.data, 'html.parser')
except Exception as e:
logerr('get_page_title(): %s', e)
try:
charset_found = False
if 'content-type' in resp.headers:
_, params = cgi.parse_header(resp.headers['content-type'])
if params.get('charset') is not None:
charset = params.get('charset')
charset_found = True
if not charset_found and soup:
meta_tag = soup.find('meta', attrs={'http-equiv': 'Content-Type'})
if meta_tag:
_, params = cgi.parse_header(meta_tag.attrs['content'])
charset = params.get('charset', charset)
parser.feed(resp.data.decode(charset))
except Exception as e:
if isinstance(e, UnicodeDecodeError) and soup:
parsed_title = soup.find('title').text
# Suppress Exception due to intentional self.reset() in BHTMLParser
if (logger.isEnabledFor(logging.DEBUG) and str(e) != 'we should not get here!'):
logerr('get_page_title(): %s', e)
finally:
return re.sub('\s{2,}', ' ', parser.parsed_title)
if not parsed_title:
parsed_title = parser.parsed_title
return re.sub('\s{2,}', ' ', parsed_title)
def gen_headers():